From 127f9229b5b7c1e22f339180bfbd01fea01641eb Mon Sep 17 00:00:00 2001 From: Quarto GHA Workflow Runner Date: Fri, 21 Mar 2025 17:30:33 +0000 Subject: [PATCH] Built site for gh-pages --- .nojekyll | 2 +- FAQS.html | 12 +- TODO.html | 8 +- docs/amd_hpc.html | 12 +- docs/api/cli.args.html | 957 +++++++ docs/api/cli.checks.html | 962 +++++++ docs/api/cli.cloud.base.html | 904 +++++++ docs/api/cli.cloud.modal_.html | 930 +++++++ docs/api/cli.config.html | 1193 +++++++++ docs/api/cli.evaluate.html | 982 +++++++ docs/api/cli.inference.html | 1061 ++++++++ docs/api/cli.main.html | 1256 +++++++++ docs/api/cli.merge_lora.html | 1001 +++++++ docs/api/cli.merge_sharded_fsdp_weights.html | 1050 ++++++++ docs/api/cli.preprocess.html | 980 +++++++ docs/api/cli.sweeps.html | 971 +++++++ docs/api/cli.train.html | 982 +++++++ docs/api/cli.utils.html | 1376 ++++++++++ docs/api/common.architectures.html | 842 ++++++ docs/api/common.const.html | 842 ++++++ docs/api/common.datasets.html | 1133 ++++++++ docs/api/convert.html | 960 +++++++ docs/api/core.chat.format.chatml.html | 842 ++++++ docs/api/core.chat.format.llama3x.html | 842 ++++++ docs/api/core.chat.format.shared.html | 842 ++++++ docs/api/core.chat.messages.html | 1014 ++++++++ docs/api/core.datasets.chat.html | 914 +++++++ ...core.datasets.transforms.chat_builder.html | 985 +++++++ docs/api/core.trainer_builder.html | 983 +++++++ docs/api/core.trainers.base.html | 1042 ++++++++ docs/api/core.trainers.dpo.trainer.html | 932 +++++++ docs/api/core.trainers.grpo.trainer.html | 904 +++++++ docs/api/core.trainers.trl.html | 954 +++++++ docs/api/core.training_args.html | 1282 +++++++++ docs/api/datasets.html | 930 +++++++ docs/api/evaluate.html | 1040 ++++++++ docs/api/index.html | 1459 +++++++++++ docs/api/integrations.base.html | 1357 ++++++++++ .../integrations.cut_cross_entropy.args.html | 904 +++++++ docs/api/integrations.grokfast.optimizer.html | 841 ++++++ docs/api/integrations.kd.trainer.html | 939 +++++++ 
docs/api/integrations.liger.args.html | 904 +++++++ docs/api/integrations.lm_eval.args.html | 904 +++++++ docs/api/integrations.spectrum.args.html | 904 +++++++ docs/api/kernels.geglu.html | 1040 ++++++++ docs/api/kernels.lora.html | 2100 +++++++++++++++ docs/api/kernels.quantize.html | 1004 +++++++ docs/api/kernels.swiglu.html | 1037 ++++++++ docs/api/kernels.utils.html | 842 ++++++ docs/api/logging_config.html | 930 +++++++ docs/api/models.mamba.modeling_mamba.html | 841 ++++++ docs/api/monkeypatch.attention.mllama.html | 926 +++++++ .../monkeypatch.btlm_attn_hijack_flash.html | 842 ++++++ ...onkeypatch.data.batch_dataset_fetcher.html | 842 ++++++ .../monkeypatch.llama_attn_hijack_flash.html | 1127 ++++++++ ...onkeypatch.llama_attn_hijack_xformers.html | 842 ++++++ docs/api/monkeypatch.llama_expand_mask.html | 842 ++++++ .../monkeypatch.llama_patch_multipack.html | 842 ++++++ docs/api/monkeypatch.lora_kernels.html | 1288 +++++++++ ...monkeypatch.mistral_attn_hijack_flash.html | 1069 ++++++++ docs/api/monkeypatch.mixtral.html | 842 ++++++ docs/api/monkeypatch.multipack.html | 842 ++++++ docs/api/monkeypatch.relora.html | 922 +++++++ ...onkeypatch.stablelm_attn_hijack_flash.html | 915 +++++++ docs/api/monkeypatch.trainer_fsdp_optim.html | 904 +++++++ .../monkeypatch.transformers_fa_utils.html | 964 +++++++ docs/api/monkeypatch.unsloth_.html | 842 ++++++ docs/api/monkeypatch.utils.html | 927 +++++++ docs/api/prompt_strategies.alpaca_chat.html | 959 +++++++ .../prompt_strategies.alpaca_instruct.html | 842 ++++++ .../prompt_strategies.alpaca_w_system.html | 952 +++++++ docs/api/prompt_strategies.base.html | 842 ++++++ ...rompt_strategies.bradley_terry.llama3.html | 905 +++++++ docs/api/prompt_strategies.chat_template.html | 975 +++++++ docs/api/prompt_strategies.completion.html | 919 +++++++ .../prompt_strategies.dpo.chat_template.html | 842 ++++++ docs/api/prompt_strategies.dpo.chatml.html | 935 +++++++ docs/api/prompt_strategies.dpo.llama3.html | 935 +++++++ 
.../prompt_strategies.dpo.passthrough.html | 842 ++++++ .../prompt_strategies.dpo.user_defined.html | 842 ++++++ docs/api/prompt_strategies.dpo.zephyr.html | 842 ++++++ docs/api/prompt_strategies.input_output.html | 919 +++++++ docs/api/prompt_strategies.kto.chatml.html | 926 +++++++ docs/api/prompt_strategies.kto.llama3.html | 926 +++++++ .../prompt_strategies.kto.user_defined.html | 842 ++++++ docs/api/prompt_strategies.llama2_chat.html | 984 +++++++ docs/api/prompt_strategies.messages.chat.html | 910 +++++++ docs/api/prompt_strategies.metharme.html | 920 +++++++ docs/api/prompt_strategies.orcamini.html | 912 +++++++ .../prompt_strategies.orpo.chat_template.html | 1030 ++++++++ docs/api/prompt_strategies.pygmalion.html | 920 +++++++ ...prompt_strategies.stepwise_supervised.html | 916 +++++++ docs/api/prompt_strategies.user_defined.html | 930 +++++++ docs/api/prompt_tokenizers.html | 1132 ++++++++ docs/api/train.html | 1573 +++++++++++ docs/api/utils.bench.html | 907 +++++++ docs/api/utils.callbacks.comet_.html | 907 +++++++ docs/api/utils.callbacks.lisa.html | 845 ++++++ docs/api/utils.callbacks.mlflow_.html | 907 +++++++ docs/api/utils.callbacks.perplexity.html | 927 +++++++ docs/api/utils.callbacks.profiler.html | 904 +++++++ docs/api/utils.chat_templates.html | 1028 ++++++++ docs/api/utils.collators.batching.html | 1140 ++++++++ docs/api/utils.collators.core.html | 842 ++++++ docs/api/utils.collators.mamba.html | 904 +++++++ docs/api/utils.collators.mm_chat.html | 914 +++++++ docs/api/utils.data.pretraining.html | 842 ++++++ docs/api/utils.data.sft.html | 842 ++++++ docs/api/utils.dict.html | 904 +++++++ docs/api/utils.distributed.html | 1011 ++++++++ docs/api/utils.freeze.html | 995 +++++++ .../utils.gradient_checkpointing.unsloth.html | 905 +++++++ docs/api/utils.lora.html | 951 +++++++ docs/api/utils.lora_embeddings.html | 904 +++++++ docs/api/utils.model_shard_quant.html | 916 +++++++ docs/api/utils.models.html | 1158 +++++++++ 
docs/api/utils.optimizers.adopt.html | 928 +++++++ docs/api/utils.samplers.multipack.html | 914 +++++++ docs/api/utils.schedulers.html | 1177 +++++++++ docs/api/utils.schemas.config.html | 914 +++++++ docs/api/utils.schemas.datasets.html | 996 +++++++ docs/api/utils.schemas.enums.html | 924 +++++++ docs/api/utils.schemas.integrations.html | 954 +++++++ docs/api/utils.schemas.model.html | 924 +++++++ docs/api/utils.schemas.peft.html | 934 +++++++ docs/api/utils.schemas.training.html | 914 +++++++ docs/api/utils.schemas.trl.html | 904 +++++++ docs/api/utils.schemas.utils.html | 987 +++++++ docs/api/utils.tokenization.html | 924 +++++++ docs/api/utils.trainer.html | 1058 ++++++++ docs/batch_vs_grad.html | 20 +- docs/cli.html | 22 +- docs/config.html | 1169 ++++----- docs/custom_integrations.html | 11 +- docs/dataset-formats/conversation.html | 8 +- docs/dataset-formats/index.html | 8 +- docs/dataset-formats/inst_tune.html | 8 +- docs/dataset-formats/pretraining.html | 8 +- docs/dataset-formats/stepwise_supervised.html | 12 +- docs/dataset-formats/template_free.html | 69 +- docs/dataset-formats/tokenized.html | 8 +- docs/dataset_preprocessing.html | 32 +- docs/debugging.html | 29 +- docs/docker.html | 8 +- docs/faq.html | 12 +- docs/fsdp_qlora.html | 11 +- docs/getting-started.html | 20 +- docs/inference.html | 8 +- docs/input_output.html | 8 +- docs/installation.html | 13 +- docs/lora_optims.html | 50 +- docs/lr_groups.html | 15 +- docs/mac.html | 8 +- docs/multi-gpu.html | 8 +- docs/multi-node.html | 8 +- docs/multimodal.html | 11 +- docs/multipack.html | 26 +- docs/nccl.html | 8 +- docs/ray-integration.html | 8 +- docs/reward_modelling.html | 11 +- docs/rlhf.html | 11 +- docs/sequence_parallelism.html | 991 +++++++ docs/torchao.html | 8 +- docs/unsloth.html | 14 +- .../colab-axolotl-example.html | 14 +- index.html | 47 +- search.json | 2298 +++++++++++++++-- sitemap.xml | 626 ++++- src/axolotl/integrations/LICENSE.html | 70 +- 
.../cut_cross_entropy/ACKNOWLEDGEMENTS.html | 11 +- styles.css | 90 +- 171 files changed, 127099 insertions(+), 1001 deletions(-) create mode 100644 docs/api/cli.args.html create mode 100644 docs/api/cli.checks.html create mode 100644 docs/api/cli.cloud.base.html create mode 100644 docs/api/cli.cloud.modal_.html create mode 100644 docs/api/cli.config.html create mode 100644 docs/api/cli.evaluate.html create mode 100644 docs/api/cli.inference.html create mode 100644 docs/api/cli.main.html create mode 100644 docs/api/cli.merge_lora.html create mode 100644 docs/api/cli.merge_sharded_fsdp_weights.html create mode 100644 docs/api/cli.preprocess.html create mode 100644 docs/api/cli.sweeps.html create mode 100644 docs/api/cli.train.html create mode 100644 docs/api/cli.utils.html create mode 100644 docs/api/common.architectures.html create mode 100644 docs/api/common.const.html create mode 100644 docs/api/common.datasets.html create mode 100644 docs/api/convert.html create mode 100644 docs/api/core.chat.format.chatml.html create mode 100644 docs/api/core.chat.format.llama3x.html create mode 100644 docs/api/core.chat.format.shared.html create mode 100644 docs/api/core.chat.messages.html create mode 100644 docs/api/core.datasets.chat.html create mode 100644 docs/api/core.datasets.transforms.chat_builder.html create mode 100644 docs/api/core.trainer_builder.html create mode 100644 docs/api/core.trainers.base.html create mode 100644 docs/api/core.trainers.dpo.trainer.html create mode 100644 docs/api/core.trainers.grpo.trainer.html create mode 100644 docs/api/core.trainers.trl.html create mode 100644 docs/api/core.training_args.html create mode 100644 docs/api/datasets.html create mode 100644 docs/api/evaluate.html create mode 100644 docs/api/index.html create mode 100644 docs/api/integrations.base.html create mode 100644 docs/api/integrations.cut_cross_entropy.args.html create mode 100644 docs/api/integrations.grokfast.optimizer.html create mode 100644 
docs/api/integrations.kd.trainer.html create mode 100644 docs/api/integrations.liger.args.html create mode 100644 docs/api/integrations.lm_eval.args.html create mode 100644 docs/api/integrations.spectrum.args.html create mode 100644 docs/api/kernels.geglu.html create mode 100644 docs/api/kernels.lora.html create mode 100644 docs/api/kernels.quantize.html create mode 100644 docs/api/kernels.swiglu.html create mode 100644 docs/api/kernels.utils.html create mode 100644 docs/api/logging_config.html create mode 100644 docs/api/models.mamba.modeling_mamba.html create mode 100644 docs/api/monkeypatch.attention.mllama.html create mode 100644 docs/api/monkeypatch.btlm_attn_hijack_flash.html create mode 100644 docs/api/monkeypatch.data.batch_dataset_fetcher.html create mode 100644 docs/api/monkeypatch.llama_attn_hijack_flash.html create mode 100644 docs/api/monkeypatch.llama_attn_hijack_xformers.html create mode 100644 docs/api/monkeypatch.llama_expand_mask.html create mode 100644 docs/api/monkeypatch.llama_patch_multipack.html create mode 100644 docs/api/monkeypatch.lora_kernels.html create mode 100644 docs/api/monkeypatch.mistral_attn_hijack_flash.html create mode 100644 docs/api/monkeypatch.mixtral.html create mode 100644 docs/api/monkeypatch.multipack.html create mode 100644 docs/api/monkeypatch.relora.html create mode 100644 docs/api/monkeypatch.stablelm_attn_hijack_flash.html create mode 100644 docs/api/monkeypatch.trainer_fsdp_optim.html create mode 100644 docs/api/monkeypatch.transformers_fa_utils.html create mode 100644 docs/api/monkeypatch.unsloth_.html create mode 100644 docs/api/monkeypatch.utils.html create mode 100644 docs/api/prompt_strategies.alpaca_chat.html create mode 100644 docs/api/prompt_strategies.alpaca_instruct.html create mode 100644 docs/api/prompt_strategies.alpaca_w_system.html create mode 100644 docs/api/prompt_strategies.base.html create mode 100644 docs/api/prompt_strategies.bradley_terry.llama3.html create mode 100644 
docs/api/prompt_strategies.chat_template.html create mode 100644 docs/api/prompt_strategies.completion.html create mode 100644 docs/api/prompt_strategies.dpo.chat_template.html create mode 100644 docs/api/prompt_strategies.dpo.chatml.html create mode 100644 docs/api/prompt_strategies.dpo.llama3.html create mode 100644 docs/api/prompt_strategies.dpo.passthrough.html create mode 100644 docs/api/prompt_strategies.dpo.user_defined.html create mode 100644 docs/api/prompt_strategies.dpo.zephyr.html create mode 100644 docs/api/prompt_strategies.input_output.html create mode 100644 docs/api/prompt_strategies.kto.chatml.html create mode 100644 docs/api/prompt_strategies.kto.llama3.html create mode 100644 docs/api/prompt_strategies.kto.user_defined.html create mode 100644 docs/api/prompt_strategies.llama2_chat.html create mode 100644 docs/api/prompt_strategies.messages.chat.html create mode 100644 docs/api/prompt_strategies.metharme.html create mode 100644 docs/api/prompt_strategies.orcamini.html create mode 100644 docs/api/prompt_strategies.orpo.chat_template.html create mode 100644 docs/api/prompt_strategies.pygmalion.html create mode 100644 docs/api/prompt_strategies.stepwise_supervised.html create mode 100644 docs/api/prompt_strategies.user_defined.html create mode 100644 docs/api/prompt_tokenizers.html create mode 100644 docs/api/train.html create mode 100644 docs/api/utils.bench.html create mode 100644 docs/api/utils.callbacks.comet_.html create mode 100644 docs/api/utils.callbacks.lisa.html create mode 100644 docs/api/utils.callbacks.mlflow_.html create mode 100644 docs/api/utils.callbacks.perplexity.html create mode 100644 docs/api/utils.callbacks.profiler.html create mode 100644 docs/api/utils.chat_templates.html create mode 100644 docs/api/utils.collators.batching.html create mode 100644 docs/api/utils.collators.core.html create mode 100644 docs/api/utils.collators.mamba.html create mode 100644 docs/api/utils.collators.mm_chat.html create mode 100644 
docs/api/utils.data.pretraining.html create mode 100644 docs/api/utils.data.sft.html create mode 100644 docs/api/utils.dict.html create mode 100644 docs/api/utils.distributed.html create mode 100644 docs/api/utils.freeze.html create mode 100644 docs/api/utils.gradient_checkpointing.unsloth.html create mode 100644 docs/api/utils.lora.html create mode 100644 docs/api/utils.lora_embeddings.html create mode 100644 docs/api/utils.model_shard_quant.html create mode 100644 docs/api/utils.models.html create mode 100644 docs/api/utils.optimizers.adopt.html create mode 100644 docs/api/utils.samplers.multipack.html create mode 100644 docs/api/utils.schedulers.html create mode 100644 docs/api/utils.schemas.config.html create mode 100644 docs/api/utils.schemas.datasets.html create mode 100644 docs/api/utils.schemas.enums.html create mode 100644 docs/api/utils.schemas.integrations.html create mode 100644 docs/api/utils.schemas.model.html create mode 100644 docs/api/utils.schemas.peft.html create mode 100644 docs/api/utils.schemas.training.html create mode 100644 docs/api/utils.schemas.trl.html create mode 100644 docs/api/utils.schemas.utils.html create mode 100644 docs/api/utils.tokenization.html create mode 100644 docs/api/utils.trainer.html create mode 100644 docs/sequence_parallelism.html diff --git a/.nojekyll b/.nojekyll index 0b18537b2..310a4948e 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -fc509c68 \ No newline at end of file +d475bb60 \ No newline at end of file diff --git a/FAQS.html b/FAQS.html index 2b056b155..5740e0f93 100644 --- a/FAQS.html +++ b/FAQS.html @@ -143,7 +143,7 @@ ul.task-list li input[type="checkbox"] { + @@ -407,7 +413,9 @@ ul.task-list li input[type="checkbox"] { diff --git a/TODO.html b/TODO.html index e93f0ca95..a9f4a1825 100644 --- a/TODO.html +++ b/TODO.html @@ -143,7 +143,7 @@ ul.task-list li input[type="checkbox"] { + diff --git a/docs/amd_hpc.html b/docs/amd_hpc.html index dc65736c8..c91489cc2 100644 --- a/docs/amd_hpc.html +++ 
b/docs/amd_hpc.html @@ -178,7 +178,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin + @@ -518,7 +524,9 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin

7. Apply xformers Workaround

-

xformers appears to be incompatible with ROCm. Apply the following workarounds: - Edit $HOME/packages/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py modifying the code to always return False for SwiGLU availability from xformers. - Edit $HOME/miniforge3/lib/python3.10/site-packages/xformers/ops/swiglu_op.py replacing the “SwiGLU” function with a pass statement.

+

xformers appears to be incompatible with ROCm. Apply the following workarounds: +- Edit $HOME/packages/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py modifying the code to always return False for SwiGLU availability from xformers. +- Edit $HOME/miniforge3/lib/python3.10/site-packages/xformers/ops/swiglu_op.py replacing the “SwiGLU” function with a pass statement.

8. Prepare Job Submission Script

diff --git a/docs/api/cli.args.html b/docs/api/cli.args.html new file mode 100644 index 000000000..17872efda --- /dev/null +++ b/docs/api/cli.args.html @@ -0,0 +1,957 @@ + + + + + + + + + +cli.args – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.args

+

cli.args

+

Module for axolotl CLI command arguments.

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
EvaluateCliArgsDataclass with CLI arguments for axolotl evaluate command.
InferenceCliArgsDataclass with CLI arguments for axolotl inference command.
PreprocessCliArgsDataclass with CLI arguments for axolotl preprocess command.
TrainerCliArgsDataclass with CLI arguments for axolotl train command.
+
+

EvaluateCliArgs

+
cli.args.EvaluateCliArgs(
+    self,
+    debug=False,
+    debug_text_only=False,
+    debug_num_examples=0,
+)
+

Dataclass with CLI arguments for axolotl evaluate command.

+
+
+

InferenceCliArgs

+
cli.args.InferenceCliArgs(self, prompter=None)
+

Dataclass with CLI arguments for axolotl inference command.

+
+
+

PreprocessCliArgs

+
cli.args.PreprocessCliArgs(
+    self,
+    debug=False,
+    debug_text_only=False,
+    debug_num_examples=1,
+    prompter=None,
+    download=True,
+    iterable=None,
+)
+

Dataclass with CLI arguments for axolotl preprocess command.

+
+
+

TrainerCliArgs

+
cli.args.TrainerCliArgs(
+    self,
+    debug=False,
+    debug_text_only=False,
+    debug_num_examples=0,
+    merge_lora=False,
+    prompter=None,
+    shard=False,
+    main_process_port=None,
+    num_processes=None,
+)
+

Dataclass with CLI arguments for axolotl train command.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/cli.checks.html b/docs/api/cli.checks.html new file mode 100644 index 000000000..040c32ab3 --- /dev/null +++ b/docs/api/cli.checks.html @@ -0,0 +1,962 @@ + + + + + + + + + +cli.checks – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.checks

+

cli.checks

+

Various checks for Axolotl CLI.

+
+

Functions

+ + + + + + + + + + + + + + + + + +
NameDescription
check_accelerate_default_configLogs at warning level if no accelerate config file is found.
check_user_tokenChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.
+
+

check_accelerate_default_config

+
cli.checks.check_accelerate_default_config()
+

Logs at warning level if no accelerate config file is found.

+
+
+

check_user_token

+
cli.checks.check_user_token()
+

Checks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.

+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
boolBoolean indicating successful check (i.e., HF_HUB_OFFLINE=1 or HF user info is retrieved).
+
+
+

Raises

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
LocalTokenNotFoundErrorIf HF user info can’t be retrieved.
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/cli.cloud.base.html b/docs/api/cli.cloud.base.html new file mode 100644 index 000000000..6ce51a4c9 --- /dev/null +++ b/docs/api/cli.cloud.base.html @@ -0,0 +1,904 @@ + + + + + + + + + +cli.cloud.base – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.cloud.base

+

cli.cloud.base

+

base class for cloud platforms from cli

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
CloudAbstract base class for cloud platforms.
+
+

Cloud

+
cli.cloud.base.Cloud()
+

Abstract base class for cloud platforms.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/cli.cloud.modal_.html b/docs/api/cli.cloud.modal_.html new file mode 100644 index 000000000..5d2e05238 --- /dev/null +++ b/docs/api/cli.cloud.modal_.html @@ -0,0 +1,930 @@ + + + + + + + + + +cli.cloud.modal_ – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.cloud.modal_

+

cli.cloud.modal_

+

Modal Cloud support from CLI

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
ModalCloudModal Cloud implementation.
+
+

ModalCloud

+
cli.cloud.modal_.ModalCloud(self, config, app=None)
+

Modal Cloud implementation.

+
+
+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
run_cmdRun a command inside a folder, with Modal Volume reloading before and commit on success.
+
+

run_cmd

+
cli.cloud.modal_.run_cmd(cmd, run_folder, volumes=None)
+

Run a command inside a folder, with Modal Volume reloading before and commit on success.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/cli.config.html b/docs/api/cli.config.html new file mode 100644 index 000000000..982879d64 --- /dev/null +++ b/docs/api/cli.config.html @@ -0,0 +1,1193 @@ + + + + + + + + + +cli.config – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.config

+

cli.config

+

Configuration loading and processing.

+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
check_remote_configFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query
choose_configHelper method for choosing a axolotl config YAML file (considering only files
load_cfgLoads the axolotl configuration stored at config, validates it, and performs
prepare_pluginsRegisters the plugins for the given configuration.
+
+

check_remote_config

+
cli.config.check_remote_config(config)
+

First, determines if the passed config is a valid HTTPS URL. Then, attempts to query +for it and parse its content, first as JSON, then as YAML (YAML is preferred). +Finally, the parsed content is written to a local file and its path is returned.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configUnion[str, Path]HTTPS URL to a YAML or JSON file.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
Union[str, Path]Either the original config if it’s not a valid HTTPS URL, or the path to the
Union[str, Path]downloaded remote config.
+
+
+

Raises

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
ValueErrorIf the remote configuration is neither valid JSON or YAML.
RuntimeErrorIf some request-related exception occurs from the file download.
ExceptionCatch-all for any other exception.
+
+
+
+

choose_config

+
cli.config.choose_config(path)
+

Helper method for choosing a axolotl config YAML file (considering only files +ending with .yml or .yaml). If more than one config file exists in the passed +path, the user is prompted to choose one.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
pathPathDirectory in which config file(s) are stored.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
strPath to either (1) the sole YAML file, or (2) if more than one YAML files exist,
strthe user-selected YAML file.
+
+
+

Raises

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
ValueErrorIf no YAML files are found in the given path.
+
+
+
+

load_cfg

+
cli.config.load_cfg(config=Path('examples/'), **kwargs)
+

Loads the axolotl configuration stored at config, validates it, and performs +various setup.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configUnion[str, Path]Path (local or remote) to axolotl config YAML file.Path('examples/')
kwargsAdditional keyword arguments to override config file values.{}
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
DictDefaultDictDefault mapping configuration keys to values.
+
+
+
+

prepare_plugins

+
cli.config.prepare_plugins(cfg)
+

Registers the plugins for the given configuration.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/cli.evaluate.html b/docs/api/cli.evaluate.html new file mode 100644 index 000000000..86f088b87 --- /dev/null +++ b/docs/api/cli.evaluate.html @@ -0,0 +1,982 @@ + + + + + + + + + +cli.evaluate – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.evaluate

+

cli.evaluate

+

CLI to run evaluation on a model.

+
+

Functions

+ + + + + + + + + + + + + + + + + +
NameDescription
do_cliParses axolotl config, CLI args, and calls do_evaluate.
do_evaluateEvaluates a transformers model by first loading the dataset(s) specified in the
+
+

do_cli

+
cli.evaluate.do_cli(config=Path('examples/'), **kwargs)
+

Parses axolotl config, CLI args, and calls do_evaluate.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configUnion[Path, str]Path to axolotl config YAML file.Path('examples/')
kwargsAdditional keyword arguments to override config file values.{}
+
+
+
+

do_evaluate

+
cli.evaluate.do_evaluate(cfg, cli_args)
+

Evaluates a transformers model by first loading the dataset(s) specified in the +axolotl config, and then calling axolotl.evaluate.evaluate, which computes +evaluation metrics on the given dataset(s) and writes them to disk.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
cli_argsTrainerCliArgsCLI arguments.required
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/cli.inference.html b/docs/api/cli.inference.html new file mode 100644 index 000000000..2dbab4262 --- /dev/null +++ b/docs/api/cli.inference.html @@ -0,0 +1,1061 @@ + + + + + + + + + +cli.inference – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.inference

+

cli.inference

+

CLI to run inference on a trained model.

+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
do_cliParses axolotl config, CLI args, and calls do_inference or do_inference_gradio.
do_inferenceRuns inference on the command line in a loop. User input is accepted, a chat template
do_inference_gradioRuns inference in a Gradio interface. User input is accepted, a chat template is
get_multi_line_inputGets multi-line input from terminal.
+
+

do_cli

+
cli.inference.do_cli(config=Path('examples/'), gradio=False, **kwargs)
+

Parses axolotl config, CLI args, and calls do_inference or do_inference_gradio.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configUnion[Path, str]Path to axolotl config YAML file.Path('examples/')
kwargsAdditional keyword arguments to override config file values.{}
+
+
+
+

do_inference

+
cli.inference.do_inference(cfg, cli_args)
+

Runs inference on the command line in a loop. User input is accepted, a chat template +is (optionally) applied, and the model specified in the axolotl config is used to +generate completions according to a default generation config.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
cli_argsInferenceCliArgsInference-specific CLI arguments.required
+
+
+
+

do_inference_gradio

+
cli.inference.do_inference_gradio(cfg, cli_args)
+

Runs inference in a Gradio interface. User input is accepted, a chat template is +(optionally) applied, and the model specified in the axolotl config is used to +generate completions according to a default generation config.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
cli_argsInferenceCliArgsInference-specific CLI arguments.required
+
+
+
+

get_multi_line_input

+
cli.inference.get_multi_line_input()
+

Gets multi-line input from terminal.

+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
strPossibly multi-line, possibly empty stdin input as a string.
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/cli.main.html b/docs/api/cli.main.html new file mode 100644 index 000000000..ba47e7343 --- /dev/null +++ b/docs/api/cli.main.html @@ -0,0 +1,1256 @@ + + + + + + + + + +cli.main – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.main

+

cli.main

+

Click CLI definitions for various axolotl commands.

+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
cliAxolotl CLI - Train and fine-tune large language models
evaluateEvaluate a model.
fetchFetch example configs or other resources.
inferenceRun inference with a trained model.
merge_loraMerge trained LoRA adapters into a base model.
merge_sharded_fsdp_weightsMerge sharded FSDP model weights.
preprocessPreprocess datasets before training.
trainTrain or fine-tune a model.
+
+

cli

+
cli.main.cli()
+

Axolotl CLI - Train and fine-tune large language models

+
+
+

evaluate

+
cli.main.evaluate(config, accelerate, **kwargs)
+

Evaluate a model.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configstrPath to axolotl config YAML file.required
accelerateboolWhether to use accelerate launcher.required
kwargsAdditional keyword arguments which correspond to CLI args or axolotl config options.{}
+
+
+
+

fetch

+
cli.main.fetch(directory, dest)
+

Fetch example configs or other resources.

+

Available directories: +- examples: Example configuration files +- deepspeed_configs: DeepSpeed configuration files

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
directorystrOne of examples, deepspeed_configs.required
destOptional[str]Optional destination directory.required
+
+
+
+

inference

+
cli.main.inference(config, accelerate, gradio, **kwargs)
+

Run inference with a trained model.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configstrPath to axolotl config YAML file.required
accelerateboolWhether to use accelerate launcher.required
gradioboolWhether to use Gradio browser interface or command line for inference.required
kwargsAdditional keyword arguments which correspond to CLI args or axolotl config options.{}
+
+
+
+

merge_lora

+
cli.main.merge_lora(config, **kwargs)
+

Merge trained LoRA adapters into a base model.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configstrPath to axolotl config YAML file.required
kwargsAdditional keyword arguments which correspond to CLI args or axolotl config options.{}
+
+
+
+

merge_sharded_fsdp_weights

+
cli.main.merge_sharded_fsdp_weights(config, accelerate, **kwargs)
+

Merge sharded FSDP model weights.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configstrPath to axolotl config YAML file.required
accelerateboolWhether to use accelerate launcher.required
kwargsAdditional keyword arguments which correspond to CLI args or axolotl config options.{}
+
+
+
+

preprocess

+
cli.main.preprocess(config, cloud=None, **kwargs)
+

Preprocess datasets before training.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configstrPath to axolotl config YAML file.required
cloudOptional[str]Path to a cloud accelerator configuration file.None
kwargsAdditional keyword arguments which correspond to CLI args or axolotl config options.{}
+
+
+
+

train

+
cli.main.train(config, accelerate, cloud=None, sweep=None, **kwargs)
+

Train or fine-tune a model.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configstrPath to axolotl config YAML file.required
accelerateboolWhether to use accelerate launcher.required
cloudOptional[str]Path to a cloud accelerator configuration fileNone
sweepOptional[str]Path to YAML config for sweeping hyperparameters.None
kwargsAdditional keyword arguments which correspond to CLI args or axolotl config options.{}
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/cli.merge_lora.html b/docs/api/cli.merge_lora.html new file mode 100644 index 000000000..822dc0a15 --- /dev/null +++ b/docs/api/cli.merge_lora.html @@ -0,0 +1,1001 @@ + + + + + + + + + +cli.merge_lora – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.merge_lora

+

cli.merge_lora

+

CLI to merge a trained LoRA into a base model.

+
+

Functions

+ + + + + + + + + + + + + + + + + +
NameDescription
do_cliParses axolotl config, CLI args, and calls do_merge_lora. Note that various
do_merge_loraCalls transformersmerge_and_unload on the model given in the axolotl config
+
+

do_cli

+
cli.merge_lora.do_cli(config=Path('examples/'), **kwargs)
+

Parses axolotl config, CLI args, and calls do_merge_lora. Note that various +config values will be overwritten to allow the LoRA merge logic to work as expected +(load_in_8bit=False, load_in4bit=False, flash_attention=False, etc.).

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configUnion[Path, str]Path to axolotl config YAML file.Path('examples/')
kwargsAdditional keyword arguments to override config file values.{}
+
+
+

Raises

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
ValueErrorIf target directory for LoRA merged model does not exist.
+
+
+
+

do_merge_lora

+
cli.merge_lora.do_merge_lora(cfg)
+

Calls transformersmerge_and_unload on the model given in the axolotl config +along with the LoRA adapters to combine them into a single base model.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/cli.merge_sharded_fsdp_weights.html b/docs/api/cli.merge_sharded_fsdp_weights.html new file mode 100644 index 000000000..3eed28f14 --- /dev/null +++ b/docs/api/cli.merge_sharded_fsdp_weights.html @@ -0,0 +1,1050 @@ + + + + + + + + + +cli.merge_sharded_fsdp_weights – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.merge_sharded_fsdp_weights

+

cli.merge_sharded_fsdp_weights

+

CLI to merge sharded FSDP model checkpoints into a single combined checkpoint.

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
BFloat16CastPlannerA custom planner to cast tensors to bfloat16 on the fly during loading.
+
+

BFloat16CastPlanner

+
cli.merge_sharded_fsdp_weights.BFloat16CastPlanner()
+

A custom planner to cast tensors to bfloat16 on the fly during loading.

+
+
+
+

Functions

+ + + + + + + + + + + + + + + + + +
NameDescription
do_cliParses axolotl config, CLI args, and calls merge_fsdp_weights.
merge_fsdp_weightsMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if
+
+

do_cli

+
cli.merge_sharded_fsdp_weights.do_cli(config=Path('examples/'), **kwargs)
+

Parses axolotl config, CLI args, and calls merge_fsdp_weights.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configUnion[Path, str]Path to axolotl config YAML file.Path('examples/')
kwargsAdditional keyword arguments to override config file values.{}
+
+
+
+

merge_fsdp_weights

+
cli.merge_sharded_fsdp_weights.merge_fsdp_weights(
+    checkpoint_dir,
+    output_path,
+    safe_serialization=False,
+    remove_checkpoint_dir=False,
+)
+

Merge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if +SHARDED_STATE_DICT was used for the model. Weights will be saved to {output_path}/model.safetensors if +safe_serialization else pytorch_model.bin.

+

Note: this is a CPU-bound process.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
checkpoint_dirstrThe directory containing the FSDP checkpoints (can be either the model or optimizer).required
output_pathstrThe path to save the merged checkpoint.required
safe_serializationbool, optional, defaults to TrueWhether to save the merged weights with safetensors (recommended).False
remove_checkpoint_dirbool, optional, defaults to FalseWhether to remove the checkpoint directory after merging.False
+
+
+

Raises

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
ValueErrorIf torch version < 2.3.0, or if checkpoint_dir does not exist.
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/cli.preprocess.html b/docs/api/cli.preprocess.html new file mode 100644 index 000000000..41491a6e7 --- /dev/null +++ b/docs/api/cli.preprocess.html @@ -0,0 +1,980 @@ + + + + + + + + + +cli.preprocess – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.preprocess

+

cli.preprocess

+

CLI to run preprocessing of a dataset.

+
+

Functions

+ + + + + + + + + + + + + + + + + +
NameDescription
do_cliParses axolotl config, CLI args, and calls do_preprocess.
do_preprocessPreprocesses dataset specified in axolotl config.
+
+

do_cli

+
cli.preprocess.do_cli(config=Path('examples/'), **kwargs)
+

Parses axolotl config, CLI args, and calls do_preprocess.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configUnion[Path, str]Path to axolotl config YAML file.Path('examples/')
kwargsAdditional keyword arguments to override config file values.{}
+
+
+
+

do_preprocess

+
cli.preprocess.do_preprocess(cfg, cli_args)
+

Preprocesses dataset specified in axolotl config.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
cli_argsPreprocessCliArgsPreprocessing-specific CLI arguments.required
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/cli.sweeps.html b/docs/api/cli.sweeps.html new file mode 100644 index 000000000..e19af7095 --- /dev/null +++ b/docs/api/cli.sweeps.html @@ -0,0 +1,971 @@ + + + + + + + + + +cli.sweeps – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.sweeps

+

cli.sweeps

+

Utilities for handling sweeps over configs for axolotl train CLI command

+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
generate_sweep_configsRecursively generates all possible configurations by applying sweeps to the base config.
+
+

generate_sweep_configs

+
cli.sweeps.generate_sweep_configs(base_config, sweeps_config)
+

Recursively generates all possible configurations by applying sweeps to the base config.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
base_configdictThe original configuration dictionaryrequired
sweeps_configdictDictionary where keys are parameters and values are either: - lists of values to sweep independently - or for paired values, a list of dicts under the ’_’ keyrequired
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
listlist[dict[str, list]]List of all possible configuration dictionaries
+
+
+

Example

+

sweeps_config = { +‘learning_rate’: [0.1, 0.01], +’_’: [ +{‘load_in_8bit’: True, ‘adapter’: ‘lora’}, +{‘load_in_4bit’: True, ‘adapter’: ‘qlora’} +] +}

+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/cli.train.html b/docs/api/cli.train.html new file mode 100644 index 000000000..d12b1fab3 --- /dev/null +++ b/docs/api/cli.train.html @@ -0,0 +1,982 @@ + + + + + + + + + +cli.train – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.train

+

cli.train

+

CLI to run training on a model.

+
+

Functions

+ + + + + + + + + + + + + + + + + +
NameDescription
do_cliParses axolotl config, CLI args, and calls do_train.
do_trainTrains a transformers model by first loading the dataset(s) specified in the
+
+

do_cli

+
cli.train.do_cli(config=Path('examples/'), **kwargs)
+

Parses axolotl config, CLI args, and calls do_train.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
configUnion[Path, str]Path to axolotl config YAML file.Path('examples/')
kwargsAdditional keyword arguments to override config file values.{}
+
+
+
+

do_train

+
cli.train.do_train(cfg, cli_args)
+

Trains a transformers model by first loading the dataset(s) specified in the +axolotl config, and then calling axolotl.train.train. Also runs the plugin +manager’s post_train_unload once training completes.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
cli_argsTrainerCliArgsTraining-specific CLI arguments.required
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/cli.utils.html b/docs/api/cli.utils.html new file mode 100644 index 000000000..0d6a60d35 --- /dev/null +++ b/docs/api/cli.utils.html @@ -0,0 +1,1376 @@ + + + + + + + + + +cli.utils – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

cli.utils

+

cli.utils

+

Utility methods for axolotl CLI.

+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
add_options_from_configCreate Click options from the fields of a Pydantic model.
add_options_from_dataclassCreate Click options from the fields of a dataclass.
build_commandBuild command list from base command and options.
download_fileDownload a single file and return its processing status.
fetch_from_githubSync files from a specific directory in the GitHub repository.
filter_none_kwargsWraps function to remove None-valued kwargs.
load_model_and_tokenizerHelper function for loading a model and tokenizer specified in the given axolotl
strip_optional_typeExtracts the non-None type from an Optional / Union type.
+
+

add_options_from_config

+
cli.utils.add_options_from_config(config_class)
+

Create Click options from the fields of a Pydantic model.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
config_classType[BaseModel]PyDantic model with fields to parse from the CLIrequired
+
+
+

Returns

+ + + + + + + + + + + + + + + +
NameTypeDescription
CallableFunction decorator for Axolotl CLI command.
+
+
+
+

add_options_from_dataclass

+
cli.utils.add_options_from_dataclass(config_class)
+

Create Click options from the fields of a dataclass.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
config_classType[Any]Dataclass with fields to parse from the CLI.required
+
+
+

Returns

+ + + + + + + + + + + + + + + +
NameTypeDescription
CallableFunction decorator for Axolotl CLI command.
+
+
+
+

build_command

+
cli.utils.build_command(base_cmd, options)
+

Build command list from base command and options.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
base_cmdlist[str]Command without options.required
optionsdict[str, Any]Options to parse and append to base command.required
+
+
+

Returns

+ + + + + + + + + + + + + + + +
NameTypeDescription
list[str]List of strings giving shell command.
+
+
+
+

download_file

+
cli.utils.download_file(file_info, raw_base_url, dest_path, dir_prefix)
+

Download a single file and return its processing status.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
file_infotupleTuple of (file_path, remote_sha).required
raw_base_urlstrBase URL for raw GitHub content.required
dest_pathPathLocal destination directory.required
dir_prefixstrDirectory prefix to filter files.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
tuple[str, str]Tuple of (file_path, status) where status is ‘new’, ‘updated’, or ‘unchanged’.
+
+
+
+

fetch_from_github

+
cli.utils.fetch_from_github(dir_prefix, dest_dir=None, max_workers=5)
+

Sync files from a specific directory in the GitHub repository. +Only downloads files that don’t exist locally or have changed.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
dir_prefixstrDirectory prefix to filter files (e.g., ‘examples/’, ‘deepspeed_configs/’).required
dest_dirstr | NoneLocal destination directory.None
max_workersintMaximum number of concurrent downloads.5
+
+
+
+

filter_none_kwargs

+
cli.utils.filter_none_kwargs(func)
+

Wraps function to remove None-valued kwargs.

+
+

Parameters

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
funcCallableFunction to wrap.required
+
+
+

Returns

+ + + + + + + + + + + + + + + +
NameTypeDescription
CallableWrapped function.
+
+
+
+

load_model_and_tokenizer

+
cli.utils.load_model_and_tokenizer(cfg, inference=False)
+

Helper function for loading a model and tokenizer specified in the given axolotl +config.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
inferenceboolBoolean denoting inference mode.False
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
tuple[PreTrainedModel, PreTrainedTokenizer | PreTrainedTokenizerFast | Any]transformers model and tokenizer.
+
+
+
+

strip_optional_type

+
cli.utils.strip_optional_type(field_type)
+

Extracts the non-None type from an Optional / Union type.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
field_typetype | str | NoneType of field for Axolotl CLI command.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
If the input type is Union[T, None] or Optional[T], returns T. Otherwise returns the input type unchanged.
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/common.architectures.html b/docs/api/common.architectures.html new file mode 100644 index 000000000..d6cc08341 --- /dev/null +++ b/docs/api/common.architectures.html @@ -0,0 +1,842 @@ + + + + + + + + + +common.architectures – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

common.architectures

+

common.architectures

+

Common architecture specific constants

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/common.const.html b/docs/api/common.const.html new file mode 100644 index 000000000..f603bff11 --- /dev/null +++ b/docs/api/common.const.html @@ -0,0 +1,842 @@ + + + + + + + + + +common.const – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

common.const

+

common.const

+

Various shared constants

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/common.datasets.html b/docs/api/common.datasets.html new file mode 100644 index 000000000..a5428ecd4 --- /dev/null +++ b/docs/api/common.datasets.html @@ -0,0 +1,1133 @@ + + + + + + + + + +common.datasets – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

common.datasets

+

common.datasets

+

Dataset loading utilities.

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
TrainDatasetMetaDataclass with fields for training and validation datasets and metadata.
+
+

TrainDatasetMeta

+
common.datasets.TrainDatasetMeta(
+    self,
+    train_dataset,
+    eval_dataset=None,
+    total_num_steps=None,
+)
+

Dataclass with fields for training and validation datasets and metadata.

+
+
+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + +
NameDescription
load_datasetsLoads one or more training or evaluation datasets, calling
load_preference_datasetsLoads one or more training or evaluation datasets for RL training using paired
sample_datasetRandomly sample num_samples samples from dataset.
+
+

load_datasets

+
common.datasets.load_datasets(cfg, cli_args)
+

Loads one or more training or evaluation datasets, calling +axolotl.utils.data.prepare_dataset. Optionally, logs out debug information.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
cli_argsUnion[PreprocessCliArgs, TrainerCliArgs]Command-specific CLI arguments.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
TrainDatasetMetaDataclass with fields for training and evaluation datasets and the computed
TrainDatasetMetatotal_num_steps.
+
+
+
+

load_preference_datasets

+
common.datasets.load_preference_datasets(cfg, cli_args)
+

Loads one or more training or evaluation datasets for RL training using paired +preference data, calling axolotl.utils.data.rl.load_prepare_preference_datasets. +Optionally, logs out debug information.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
cli_argsUnion[PreprocessCliArgs, TrainerCliArgs]Command-specific CLI arguments.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
TrainDatasetMetaDataclass with fields for training and evaluation datasets and the computed
TrainDatasetMetatotal_num_steps.
+
+
+
+

sample_dataset

+
common.datasets.sample_dataset(dataset, num_samples)
+

Randomly sample num_samples samples from dataset.

+
+

Parameters

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
datasetDatasetDataset.required
num_samplesintNumber of samples to return.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
DatasetRandom sample (with replacement) of examples in dataset.
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/convert.html b/docs/api/convert.html new file mode 100644 index 000000000..2f4200653 --- /dev/null +++ b/docs/api/convert.html @@ -0,0 +1,960 @@ + + + + + + + + + +convert – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

convert

+

convert

+

Module containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
FileReaderReads a file and returns its contents as a string
FileWriterWrites a string to a file
JsonParserParses a string as JSON and returns the result
JsonToJsonlConverterConverts a JSON file to JSONL
JsonlSerializerSerializes a list of JSON objects into a JSONL string
StdoutWriterWrites a string to stdout
+
+

FileReader

+
convert.FileReader()
+

Reads a file and returns its contents as a string

+
+
+

FileWriter

+
convert.FileWriter(self, file_path)
+

Writes a string to a file

+
+
+

JsonParser

+
convert.JsonParser()
+

Parses a string as JSON and returns the result

+
+
+

JsonToJsonlConverter

+
convert.JsonToJsonlConverter(
+    self,
+    file_reader,
+    file_writer,
+    json_parser,
+    jsonl_serializer,
+)
+

Converts a JSON file to JSONL

+
+
+

JsonlSerializer

+
convert.JsonlSerializer()
+

Serializes a list of JSON objects into a JSONL string

+
+
+

StdoutWriter

+
convert.StdoutWriter()
+

Writes a string to stdout

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/core.chat.format.chatml.html b/docs/api/core.chat.format.chatml.html new file mode 100644 index 000000000..1ea8ff2d3 --- /dev/null +++ b/docs/api/core.chat.format.chatml.html @@ -0,0 +1,842 @@ + + + + + + + + + +core.chat.format.chatml – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

core.chat.format.chatml

+

core.chat.format.chatml

+

ChatML transformation functions for MessageContents

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/core.chat.format.llama3x.html b/docs/api/core.chat.format.llama3x.html new file mode 100644 index 000000000..2d7db1ad6 --- /dev/null +++ b/docs/api/core.chat.format.llama3x.html @@ -0,0 +1,842 @@ + + + + + + + + + +core.chat.format.llama3x – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

core.chat.format.llama3x

+

core.chat.format.llama3x

+

Llama 3.x chat formatting functions for MessageContents

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/core.chat.format.shared.html b/docs/api/core.chat.format.shared.html new file mode 100644 index 000000000..6cac7670e --- /dev/null +++ b/docs/api/core.chat.format.shared.html @@ -0,0 +1,842 @@ + + + + + + + + + +core.chat.format.shared – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

core.chat.format.shared

+

core.chat.format.shared

+

shared functions for format transforms

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/core.chat.messages.html b/docs/api/core.chat.messages.html new file mode 100644 index 000000000..237e42121 --- /dev/null +++ b/docs/api/core.chat.messages.html @@ -0,0 +1,1014 @@ + + + + + + + + + +core.chat.messages – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

core.chat.messages

+

core.chat.messages

+

internal message representations of chat messages

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
ChatFormattedChatsChat formatted chats with formatter and optional train on inputs
Chatstop level data structure for chat conversations
MessageContentTypesMessage content types for text, image, audio, tool calls, and tool responses
MessageContentsMessage contents with type, value, metadata, weight, newline, and end of contents
MessageRolesMessage roles for the system, user, assistant, and tools
MessagesMessages with role, content, metadata, weight, and chat formatting
PreferenceChatsrepresentation for preference data for chat
SpecialTokenSpecial tokens for beginning of string and end of string
ToolTool with description, function, and parameters
ToolCallContentsTool call contents with name, arguments, and optional id
ToolCallFunctionTool call function with name and arguments
ToolResponseContentsTool response contents with name, content, and optional id
+
+

ChatFormattedChats

+
core.chat.messages.ChatFormattedChats()
+

Chat formatted chats with formatter and optional train on inputs

+
+
+

Chats

+
core.chat.messages.Chats()
+

top level data structure for chat conversations

+
+
+

MessageContentTypes

+
core.chat.messages.MessageContentTypes()
+

Message content types for text, image, audio, tool calls, and tool responses

+
+
+

MessageContents

+
core.chat.messages.MessageContents()
+

Message contents with type, value, metadata, weight, newline, and end of contents

+
+
+

MessageRoles

+
core.chat.messages.MessageRoles()
+

Message roles for the system, user, assistant, and tools

+
+
+

Messages

+
core.chat.messages.Messages()
+

Messages with role, content, metadata, weight, and chat formatting

+
+
+

PreferenceChats

+
core.chat.messages.PreferenceChats()
+

representation for preference data for chat

+
+
+

SpecialToken

+
core.chat.messages.SpecialToken()
+

Special tokens for beginning of string and end of string

+
+
+

Tool

+
core.chat.messages.Tool()
+

Tool with description, function, and parameters

+
+
+

ToolCallContents

+
core.chat.messages.ToolCallContents()
+

Tool call contents with name, arguments, and optional id

+
+
+

ToolCallFunction

+
core.chat.messages.ToolCallFunction()
+

Tool call function with name and arguments

+
+
+

ToolResponseContents

+
core.chat.messages.ToolResponseContents()
+

Tool response contents with name, content, and optional id

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/core.datasets.chat.html b/docs/api/core.datasets.chat.html new file mode 100644 index 000000000..19e2d8410 --- /dev/null +++ b/docs/api/core.datasets.chat.html @@ -0,0 +1,914 @@ + + + + + + + + + +core.datasets.chat – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

core.datasets.chat

+

core.datasets.chat

+

chat dataset module

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
TokenizedChatDatasetTokenized chat dataset
+
+

TokenizedChatDataset

+
core.datasets.chat.TokenizedChatDataset(
+    self,
+    data,
+    model_transform,
+    *args,
+    message_transform=None,
+    formatter=None,
+    process_count=None,
+    keep_in_memory=False,
+    **kwargs,
+)
+

Tokenized chat dataset

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/core.datasets.transforms.chat_builder.html b/docs/api/core.datasets.transforms.chat_builder.html new file mode 100644 index 000000000..282256361 --- /dev/null +++ b/docs/api/core.datasets.transforms.chat_builder.html @@ -0,0 +1,985 @@ + + + + + + + + + +core.datasets.transforms.chat_builder – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

core.datasets.transforms.chat_builder

+

core.datasets.transforms.chat_builder

+

This module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat.

+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
chat_message_transform_builderBuilds a transform that takes a row from the dataset and converts it to a Chat
+
+

chat_message_transform_builder

+
core.datasets.transforms.chat_builder.chat_message_transform_builder(
+    train_on_inputs=False,
+    conversations_field='conversations',
+    message_field_role=['role', 'from'],
+    message_field_content=['value', 'text', 'content'],
+    message_field_training=['train', 'weight'],
+)
+

Builds a transform that takes a row from the dataset and converts it to a Chat

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
train_on_inputsboolIf True, the transform will train on the inputs. If False, the transform will train on the targets. Defaults to False.False
conversations_fieldstrThe field name of the conversations. Defaults to “conversations”.'conversations'
message_field_rolestr | list[str]The field name of the role. Defaults to “role”.['role', 'from']
message_field_contentstr | list[str]The field name of the message content. Defaults to “content”.['value', 'text', 'content']
message_field_trainingstr | list[str]The field name of the train/weight. Defaults to “weight”.['train', 'weight']
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
CallableA function that takes a list of conversations and returns a list of messages.
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/core.trainer_builder.html b/docs/api/core.trainer_builder.html new file mode 100644 index 000000000..007be2f6e --- /dev/null +++ b/docs/api/core.trainer_builder.html @@ -0,0 +1,983 @@ + + + + + + + + + +core.trainer_builder – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

core.trainer_builder

+

core.trainer_builder

+

Builder for the training args and trainer

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
HFCausalTrainerBuilderBuild the HuggingFace training args/trainer for causal models and reward modeling
HFPPOTrainerBuilderHF Factory class for PPO Trainer
HFRLTrainerBuilderTrainer factory class for TRL-based RLHF trainers (e.g. DPO)
TrainerBuilderBaseBase class for trainer builder.
+
+

HFCausalTrainerBuilder

+
core.trainer_builder.HFCausalTrainerBuilder(
+    self,
+    cfg,
+    model,
+    tokenizer,
+    processor=None,
+)
+

Build the HuggingFace training args/trainer for causal models and reward modeling +using TRL.

+
+
+

HFPPOTrainerBuilder

+
core.trainer_builder.HFPPOTrainerBuilder(
+    self,
+    cfg,
+    model,
+    tokenizer,
+    processor=None,
+)
+

HF Factory class for PPO Trainer

+
+
+

HFRLTrainerBuilder

+
core.trainer_builder.HFRLTrainerBuilder(
+    self,
+    cfg,
+    model,
+    tokenizer,
+    processor=None,
+)
+

Trainer factory class for TRL-based RLHF trainers (e.g. DPO)

+
+
+

TrainerBuilderBase

+
core.trainer_builder.TrainerBuilderBase(
+    self,
+    cfg,
+    model,
+    tokenizer,
+    processor=None,
+)
+

Base class for trainer builder.

+
+

Methods

+ + + + + + + + + + + + + +
NameDescription
get_post_trainer_create_callbacksCallbacks added after the trainer is created, usually b/c these need access to the trainer
+
+
get_post_trainer_create_callbacks
+
core.trainer_builder.TrainerBuilderBase.get_post_trainer_create_callbacks(
+    trainer,
+)
+

Callbacks added after the trainer is created, usually b/c these need access to the trainer

+ + +
+
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/core.trainers.base.html b/docs/api/core.trainers.base.html new file mode 100644 index 000000000..2b30b1c2d --- /dev/null +++ b/docs/api/core.trainers.base.html @@ -0,0 +1,1042 @@ + + + + + + + + + +core.trainers.base – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

core.trainers.base

+

core.trainers.base

+

Module for customized trainers

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
AxolotlTrainerExtend the base Trainer for axolotl helpers
+
+

AxolotlTrainer

+
core.trainers.base.AxolotlTrainer(
+    self,
+    *_args,
+    bench_data_collator=None,
+    eval_data_collator=None,
+    dataset_tags=None,
+    **kwargs,
+)
+

Extend the base Trainer for axolotl helpers

+
+

Methods

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
get_eval_dataloaderGet dataloader for evaluation
get_train_dataloaderGet dataloader for training
logLog logs on the various objects watching training, including stored metrics.
push_to_hubOverwrite the push_to_hub method in order to force-add the tags when pushing the
training_stepPerform a training step on a batch of inputs. Overrides the
+
+
get_eval_dataloader
+
core.trainers.base.AxolotlTrainer.get_eval_dataloader(eval_dataset=None)
+

Get dataloader for evaluation

+
+
+
get_train_dataloader
+
core.trainers.base.AxolotlTrainer.get_train_dataloader()
+

Get dataloader for training

+
+
+
log
+
core.trainers.base.AxolotlTrainer.log(logs, start_time=None)
+

Log logs on the various objects watching training, including stored metrics.

+
+
Parameters
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
logsdict[str, float]The values to log.required
start_timefloat | NoneThe start of training.None
+
+
+
+
push_to_hub
+
core.trainers.base.AxolotlTrainer.push_to_hub(*args, **kwargs)
+

Overwrite the push_to_hub method in order to force-add the tags when pushing the +model on the Hub. Please refer to ~transformers.Trainer.push_to_hub for more details.

+
+
+
training_step
+
core.trainers.base.AxolotlTrainer.training_step(
+    model,
+    inputs,
+    num_items_in_batch=None,
+)
+

Perform a training step on a batch of inputs. Overrides the +transformers.trainer.Trainer method to handle sequence parallelism if +enabled.

+
+
Parameters
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
modelnn.ModuleModel to perform training step for.required
inputsdict[str, torch.Tensor | Any]Dictionary mapping.required
+ + +
+
+
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/core.trainers.dpo.trainer.html b/docs/api/core.trainers.dpo.trainer.html new file mode 100644 index 000000000..a970e12f9 --- /dev/null +++ b/docs/api/core.trainers.dpo.trainer.html @@ -0,0 +1,932 @@ + + + + + + + + + +core.trainers.dpo.trainer – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

core.trainers.dpo.trainer

+

core.trainers.dpo.trainer

+

DPO trainer for axolotl

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
AxolotlDPOTrainerExtend the base DPOTrainer for axolotl helpers
+
+

AxolotlDPOTrainer

+
core.trainers.dpo.trainer.AxolotlDPOTrainer(
+    self,
+    *args,
+    dataset_tags=None,
+    **kwargs,
+)
+

Extend the base DPOTrainer for axolotl helpers

+
+

Methods

+ + + + + + + + + + + + + +
NameDescription
push_to_hubOverwrite the push_to_hub method in order to force-add the tags when pushing the
+
+
push_to_hub
+
core.trainers.dpo.trainer.AxolotlDPOTrainer.push_to_hub(*args, **kwargs)
+

Overwrite the push_to_hub method in order to force-add the tags when pushing the +model on the Hub. Please refer to ~transformers.Trainer.push_to_hub for more details.

+ + +
+
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/core.trainers.grpo.trainer.html b/docs/api/core.trainers.grpo.trainer.html new file mode 100644 index 000000000..03a77251a --- /dev/null +++ b/docs/api/core.trainers.grpo.trainer.html @@ -0,0 +1,904 @@ + + + + + + + + + +core.trainers.grpo.trainer – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

core.trainers.grpo.trainer

+

core.trainers.grpo.trainer

+

Axolotl GRPO trainer

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
AxolotlGRPOTrainerExtend the base GRPOTrainer for axolotl helpers
+
+

AxolotlGRPOTrainer

+
core.trainers.grpo.trainer.AxolotlGRPOTrainer(self, *args, **kwargs)
+

Extend the base GRPOTrainer for axolotl helpers

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/core.trainers.trl.html b/docs/api/core.trainers.trl.html new file mode 100644 index 000000000..93d147e86 --- /dev/null +++ b/docs/api/core.trainers.trl.html @@ -0,0 +1,954 @@ + + + + + + + + + +core.trainers.trl – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

core.trainers.trl

+

core.trainers.trl

+

Module for TRL PPO trainer

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
AxolotlCPOTrainerExtend the base CPOTrainer for axolotl helpers
AxolotlKTOTrainerExtend the base KTOTrainer for axolotl helpers
AxolotlORPOTrainerExtend the base ORPOTrainer for axolotl helpers
AxolotlPRMTrainerExtend the base trl.PRMTrainer for axolotl helpers
AxolotlRewardTrainerExtend the base RewardTrainer for axolotl helpers
TRLPPOTrainerWrapper for TRL PPO trainer to handle customizations
+
+

AxolotlCPOTrainer

+
core.trainers.trl.AxolotlCPOTrainer()
+

Extend the base CPOTrainer for axolotl helpers

+
+
+

AxolotlKTOTrainer

+
core.trainers.trl.AxolotlKTOTrainer()
+

Extend the base KTOTrainer for axolotl helpers

+
+
+

AxolotlORPOTrainer

+
core.trainers.trl.AxolotlORPOTrainer()
+

Extend the base ORPOTrainer for axolotl helpers

+
+
+

AxolotlPRMTrainer

+
core.trainers.trl.AxolotlPRMTrainer()
+

Extend the base trl.PRMTrainer for axolotl helpers

+
+
+

AxolotlRewardTrainer

+
core.trainers.trl.AxolotlRewardTrainer()
+

Extend the base RewardTrainer for axolotl helpers

+
+
+

TRLPPOTrainer

+
core.trainers.trl.TRLPPOTrainer()
+

Wrapper for TRL PPO trainer to handle customizations

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/core.training_args.html b/docs/api/core.training_args.html new file mode 100644 index 000000000..c12feb863 --- /dev/null +++ b/docs/api/core.training_args.html @@ -0,0 +1,1282 @@ + + + + + + + + + +core.training_args – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

core.training_args

+

core.training_args

+

extra axolotl specific training args

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
AxolotlCPOConfigCPO config for CPO training
AxolotlKTOConfigKTO config for KTO training
AxolotlORPOConfigORPO config for ORPO training
AxolotlPRMConfigPRM config for PRM training
AxolotlRewardConfigReward config for Reward training
AxolotlTrainingArgumentsTraining arguments for Causal trainer
AxolotlTrainingMixinsMixin class for the Axolotl training args.
+
+

AxolotlCPOConfig

+
core.training_args.AxolotlCPOConfig(
+    self,
+    model_type=None,
+    lr_quadratic_warmup=False,
+    pretraining=False,
+    sample_packing=False,
+    multipack_real_batches=False,
+    eval_sample_packing=None,
+    sample_packing_efficiency=1.0,
+    sample_packing_bin_size=200,
+    sample_packing_group_size=100000,
+    max_seq_length=2048,
+    relora_steps=None,
+    relora_warmup_steps=None,
+    relora_anneal_steps=None,
+    relora_prune_ratio=0.9,
+    bench_split='eval',
+    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',
+    do_bench_eval=False,
+    do_causal_lm_eval=False,
+    max_bench_samples=None,
+    bench_source_max_len=2048,
+    dataloader_prefetch_factor=None,
+    cosine_min_lr_ratio=None,
+    cosine_constant_lr_ratio=None,
+    loraplus_lr_ratio=None,
+    loraplus_lr_embedding=1e-06,
+    embedding_lr_scale=None,
+    lr_groups=None,
+    embedding_lr=None,
+    qlora=False,
+    orpo_alpha=None,
+    lisa_n_layers=None,
+    lisa_step_interval=None,
+    lisa_layers_attribute=None,
+    curriculum_sampling=None,
+    alternate_optimizer=None,
+    alternate_lr_scheduler_type=None,
+    chat_template=None,
+    kd_ce_alpha=None,
+    kd_alpha=1.0,
+    kd_temperature=1.0,
+    kd_zscore_base_temp=None,
+    kd_top_k_before_softmax=None,
+    sequence_parallel_degree=1,
+    simpo_gamma=None,
+)
+

CPO config for CPO training

+
+
+

AxolotlKTOConfig

+
core.training_args.AxolotlKTOConfig(
+    self,
+    model_type=None,
+    lr_quadratic_warmup=False,
+    pretraining=False,
+    sample_packing=False,
+    multipack_real_batches=False,
+    eval_sample_packing=None,
+    sample_packing_efficiency=1.0,
+    sample_packing_bin_size=200,
+    sample_packing_group_size=100000,
+    max_seq_length=2048,
+    relora_steps=None,
+    relora_warmup_steps=None,
+    relora_anneal_steps=None,
+    relora_prune_ratio=0.9,
+    bench_split='eval',
+    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',
+    do_bench_eval=False,
+    do_causal_lm_eval=False,
+    max_bench_samples=None,
+    bench_source_max_len=2048,
+    dataloader_prefetch_factor=None,
+    cosine_min_lr_ratio=None,
+    cosine_constant_lr_ratio=None,
+    loraplus_lr_ratio=None,
+    loraplus_lr_embedding=1e-06,
+    embedding_lr_scale=None,
+    lr_groups=None,
+    embedding_lr=None,
+    qlora=False,
+    orpo_alpha=None,
+    lisa_n_layers=None,
+    lisa_step_interval=None,
+    lisa_layers_attribute=None,
+    curriculum_sampling=None,
+    alternate_optimizer=None,
+    alternate_lr_scheduler_type=None,
+    chat_template=None,
+    kd_ce_alpha=None,
+    kd_alpha=1.0,
+    kd_temperature=1.0,
+    kd_zscore_base_temp=None,
+    kd_top_k_before_softmax=None,
+    sequence_parallel_degree=1,
+)
+

KTO config for KTO training

+
+
+

AxolotlORPOConfig

+
core.training_args.AxolotlORPOConfig(
+    self,
+    model_type=None,
+    lr_quadratic_warmup=False,
+    pretraining=False,
+    sample_packing=False,
+    multipack_real_batches=False,
+    eval_sample_packing=None,
+    sample_packing_efficiency=1.0,
+    sample_packing_bin_size=200,
+    sample_packing_group_size=100000,
+    max_seq_length=2048,
+    relora_steps=None,
+    relora_warmup_steps=None,
+    relora_anneal_steps=None,
+    relora_prune_ratio=0.9,
+    bench_split='eval',
+    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',
+    do_bench_eval=False,
+    do_causal_lm_eval=False,
+    max_bench_samples=None,
+    bench_source_max_len=2048,
+    dataloader_prefetch_factor=None,
+    cosine_min_lr_ratio=None,
+    cosine_constant_lr_ratio=None,
+    loraplus_lr_ratio=None,
+    loraplus_lr_embedding=1e-06,
+    embedding_lr_scale=None,
+    lr_groups=None,
+    embedding_lr=None,
+    qlora=False,
+    orpo_alpha=None,
+    lisa_n_layers=None,
+    lisa_step_interval=None,
+    lisa_layers_attribute=None,
+    curriculum_sampling=None,
+    alternate_optimizer=None,
+    alternate_lr_scheduler_type=None,
+    chat_template=None,
+    kd_ce_alpha=None,
+    kd_alpha=1.0,
+    kd_temperature=1.0,
+    kd_zscore_base_temp=None,
+    kd_top_k_before_softmax=None,
+    sequence_parallel_degree=1,
+)
+

ORPO config for ORPO training

+
+
+

AxolotlPRMConfig

+
core.training_args.AxolotlPRMConfig(
+    self,
+    model_type=None,
+    lr_quadratic_warmup=False,
+    pretraining=False,
+    sample_packing=False,
+    multipack_real_batches=False,
+    eval_sample_packing=None,
+    sample_packing_efficiency=1.0,
+    sample_packing_bin_size=200,
+    sample_packing_group_size=100000,
+    max_seq_length=2048,
+    relora_steps=None,
+    relora_warmup_steps=None,
+    relora_anneal_steps=None,
+    relora_prune_ratio=0.9,
+    bench_split='eval',
+    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',
+    do_bench_eval=False,
+    do_causal_lm_eval=False,
+    max_bench_samples=None,
+    bench_source_max_len=2048,
+    dataloader_prefetch_factor=None,
+    cosine_min_lr_ratio=None,
+    cosine_constant_lr_ratio=None,
+    loraplus_lr_ratio=None,
+    loraplus_lr_embedding=1e-06,
+    embedding_lr_scale=None,
+    lr_groups=None,
+    embedding_lr=None,
+    qlora=False,
+    orpo_alpha=None,
+    lisa_n_layers=None,
+    lisa_step_interval=None,
+    lisa_layers_attribute=None,
+    curriculum_sampling=None,
+    alternate_optimizer=None,
+    alternate_lr_scheduler_type=None,
+    chat_template=None,
+    kd_ce_alpha=None,
+    kd_alpha=1.0,
+    kd_temperature=1.0,
+    kd_zscore_base_temp=None,
+    kd_top_k_before_softmax=None,
+    sequence_parallel_degree=1,
+)
+

PRM config for PRM training

+
+
+

AxolotlRewardConfig

+
core.training_args.AxolotlRewardConfig(
+    self,
+    model_type=None,
+    lr_quadratic_warmup=False,
+    pretraining=False,
+    sample_packing=False,
+    multipack_real_batches=False,
+    eval_sample_packing=None,
+    sample_packing_efficiency=1.0,
+    sample_packing_bin_size=200,
+    sample_packing_group_size=100000,
+    max_seq_length=2048,
+    relora_steps=None,
+    relora_warmup_steps=None,
+    relora_anneal_steps=None,
+    relora_prune_ratio=0.9,
+    bench_split='eval',
+    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',
+    do_bench_eval=False,
+    do_causal_lm_eval=False,
+    max_bench_samples=None,
+    bench_source_max_len=2048,
+    dataloader_prefetch_factor=None,
+    cosine_min_lr_ratio=None,
+    cosine_constant_lr_ratio=None,
+    loraplus_lr_ratio=None,
+    loraplus_lr_embedding=1e-06,
+    embedding_lr_scale=None,
+    lr_groups=None,
+    embedding_lr=None,
+    qlora=False,
+    orpo_alpha=None,
+    lisa_n_layers=None,
+    lisa_step_interval=None,
+    lisa_layers_attribute=None,
+    curriculum_sampling=None,
+    alternate_optimizer=None,
+    alternate_lr_scheduler_type=None,
+    chat_template=None,
+    kd_ce_alpha=None,
+    kd_alpha=1.0,
+    kd_temperature=1.0,
+    kd_zscore_base_temp=None,
+    kd_top_k_before_softmax=None,
+    sequence_parallel_degree=1,
+)
+

Reward config for Reward training

+
+
+

AxolotlTrainingArguments

+
core.training_args.AxolotlTrainingArguments(
+    self,
+    model_type=None,
+    lr_quadratic_warmup=False,
+    pretraining=False,
+    sample_packing=False,
+    multipack_real_batches=False,
+    eval_sample_packing=None,
+    sample_packing_efficiency=1.0,
+    sample_packing_bin_size=200,
+    sample_packing_group_size=100000,
+    max_seq_length=2048,
+    relora_steps=None,
+    relora_warmup_steps=None,
+    relora_anneal_steps=None,
+    relora_prune_ratio=0.9,
+    bench_split='eval',
+    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',
+    do_bench_eval=False,
+    do_causal_lm_eval=False,
+    max_bench_samples=None,
+    bench_source_max_len=2048,
+    dataloader_prefetch_factor=None,
+    cosine_min_lr_ratio=None,
+    cosine_constant_lr_ratio=None,
+    loraplus_lr_ratio=None,
+    loraplus_lr_embedding=1e-06,
+    embedding_lr_scale=None,
+    lr_groups=None,
+    embedding_lr=None,
+    qlora=False,
+    orpo_alpha=None,
+    lisa_n_layers=None,
+    lisa_step_interval=None,
+    lisa_layers_attribute=None,
+    curriculum_sampling=None,
+    alternate_optimizer=None,
+    alternate_lr_scheduler_type=None,
+    chat_template=None,
+    kd_ce_alpha=None,
+    kd_alpha=1.0,
+    kd_temperature=1.0,
+    kd_zscore_base_temp=None,
+    kd_top_k_before_softmax=None,
+    sequence_parallel_degree=1,
+)
+

Training arguments for Causal trainer

+

This code is duplicated due to HF TrainingArguments not setting output_dir with a +default value so it can’t be used as a mixin.

+
+
+

AxolotlTrainingMixins

+
core.training_args.AxolotlTrainingMixins(
+    self,
+    model_type=None,
+    lr_quadratic_warmup=False,
+    pretraining=False,
+    sample_packing=False,
+    multipack_real_batches=False,
+    eval_sample_packing=None,
+    sample_packing_efficiency=1.0,
+    sample_packing_bin_size=200,
+    sample_packing_group_size=100000,
+    max_seq_length=2048,
+    relora_steps=None,
+    relora_warmup_steps=None,
+    relora_anneal_steps=None,
+    relora_prune_ratio=0.9,
+    bench_split='eval',
+    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',
+    do_bench_eval=False,
+    do_causal_lm_eval=False,
+    max_bench_samples=None,
+    bench_source_max_len=2048,
+    dataloader_prefetch_factor=None,
+    cosine_min_lr_ratio=None,
+    cosine_constant_lr_ratio=None,
+    loraplus_lr_ratio=None,
+    loraplus_lr_embedding=1e-06,
+    embedding_lr_scale=None,
+    lr_groups=None,
+    embedding_lr=None,
+    qlora=False,
+    orpo_alpha=None,
+    lisa_n_layers=None,
+    lisa_step_interval=None,
+    lisa_layers_attribute=None,
+    curriculum_sampling=None,
+    alternate_optimizer=None,
+    alternate_lr_scheduler_type=None,
+    chat_template=None,
+    kd_ce_alpha=None,
+    kd_alpha=1.0,
+    kd_temperature=1.0,
+    kd_zscore_base_temp=None,
+    kd_top_k_before_softmax=None,
+    sequence_parallel_degree=1,
+)
+

Mixin class for the Axolotl training args.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/datasets.html b/docs/api/datasets.html new file mode 100644 index 000000000..1c93cb4ea --- /dev/null +++ b/docs/api/datasets.html @@ -0,0 +1,930 @@ + + + + + + + + + +datasets – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

datasets

+

datasets

+

Module containing Dataset functionality

+
+

Classes

+ + + + + + + + + + + + + + + + + +
NameDescription
ConstantLengthDatasetIterable dataset that returns constant length chunks of tokens from stream of text files.
TokenizedPromptDatasetDataset that returns tokenized prompts from a stream of text files.
+
+

ConstantLengthDataset

+
datasets.ConstantLengthDataset(self, tokenizer, datasets, seq_length=2048)
+

Iterable dataset that returns constant length chunks of tokens from stream of text files. +Args: +tokenizer (Tokenizer): The processor used for processing the data. +dataset (dataset.Dataset): Dataset with text files. +seq_length (int): Length of token sequences to return.

+
+
+

TokenizedPromptDataset

+
datasets.TokenizedPromptDataset(
+    self,
+    prompt_tokenizer,
+    dataset,
+    process_count=None,
+    keep_in_memory=False,
+    **kwargs,
+)
+

Dataset that returns tokenized prompts from a stream of text files. +Args: +prompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for processing the data. +dataset (dataset.Dataset): Dataset with text files. +process_count (int): Number of processes to use for tokenizing. +keep_in_memory (bool): Whether to keep the tokenized dataset in memory.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/evaluate.html b/docs/api/evaluate.html new file mode 100644 index 000000000..de546c9c8 --- /dev/null +++ b/docs/api/evaluate.html @@ -0,0 +1,1040 @@ + + + + + + + + + +evaluate – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

evaluate

+

evaluate

+

Module for evaluating models.

+
+

Functions

+ + + + + + + + + + + + + + + + + +
NameDescription
evaluateEvaluate a model on training and validation datasets.
evaluate_datasetHelper function to evaluate a single dataset.
+
+

evaluate

+
evaluate.evaluate(cfg, dataset_meta)
+

Evaluate a model on training and validation datasets.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
dataset_metaTrainDatasetMetaDataset metadata containing training and evaluation datasets.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
Dict[str, float]Dictionary mapping metric names to their values.
+
+
+
+

evaluate_dataset

+
evaluate.evaluate_dataset(trainer, dataset, dataset_type, flash_optimum=False)
+

Helper function to evaluate a single dataset.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
trainerTrainerThe trainer instance.required
datasetDatasetDataset to evaluate.required
dataset_typestrType of dataset (‘train’ or ‘eval’).required
flash_optimumboolWhether to use flash optimum.False
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
Optional[Dict[str, float]]Dictionary of metrics or None if dataset is None.
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/index.html b/docs/api/index.html new file mode 100644 index 000000000..24d96c050 --- /dev/null +++ b/docs/api/index.html @@ -0,0 +1,1459 @@ + + + + + + + + + +index – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

API Reference

+
+

Core

+

Core functionality for training

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
trainPrepare and train a model on a dataset. Can also infer from a model or merge lora
evaluateModule for evaluating models.
datasetsModule containing Dataset functionality
convertModule containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes
prompt_tokenizersModule containing PromptTokenizingStrategy and Prompter classes
logging_configCommon logging module for axolotl
core.trainer_builderBuilder for the training args and trainer
core.training_argsextra axolotl specific training args
core.chat.messagesinternal message representations of chat messages
core.chat.format.chatmlChatML transformation functions for MessageContents
core.chat.format.llama3xLlama 3.x chat formatting functions for MessageContents
core.chat.format.sharedshared functions for format transforms
core.datasets.chatchat dataset module
core.datasets.transforms.chat_builderThis module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat.
+
+
+

CLI

+

Command-line interface

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
cli.mainClick CLI definitions for various axolotl commands.
cli.trainCLI to run training on a model.
cli.evaluateCLI to run evaluation on a model.
cli.argsModule for axolotl CLI command arguments.
cli.checksVarious checks for Axolotl CLI.
cli.configConfiguration loading and processing.
cli.inferenceCLI to run inference on a trained model.
cli.merge_loraCLI to merge a trained LoRA into a base model.
cli.merge_sharded_fsdp_weightsCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.
cli.preprocessCLI to run preprocessing of a dataset.
cli.sweepsUtilities for handling sweeps over configs for axolotl train CLI command
cli.utilsUtility methods for axolotl CLI.
cli.cloud.basebase class for cloud platforms from cli
cli.cloud.modal_Modal Cloud support from CLI
+
+
+

Trainers

+

Training implementations

+ + + + + + + + + + + + + + + + + + + +
core.trainers.baseModule for customized trainers
core.trainers.trlModule for TRL PPO trainer
core.trainers.dpo.trainerDPO trainer for axolotl
core.trainers.grpo.trainerAxolotl GRPO trainer
+
+
+

Prompt Strategies

+

Prompt formatting strategies

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
prompt_strategies.basemodule for base dataset transform strategies
prompt_strategies.chat_templateHF Chat Templates prompt strategy
prompt_strategies.alpaca_chatModule for Alpaca prompt strategy classes
prompt_strategies.alpaca_instructModule loading the AlpacaInstructPromptTokenizingStrategy class
prompt_strategies.alpaca_w_systemPrompt strategies loader for alpaca instruction datasets with system prompts
prompt_strategies.user_definedUser Defined prompts with configuration from the YML config
prompt_strategies.llama2_chatPrompt Strategy for finetuning Llama2 chat models
prompt_strategies.completionBasic completion text
prompt_strategies.input_outputModule for plain input/output prompt pairs
prompt_strategies.stepwise_supervisedModule for stepwise datasets, typically including a prompt and reasoning traces,
prompt_strategies.metharmeModule containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class
prompt_strategies.orcaminiPrompt Strategy for finetuning Orca Mini (v2) models
prompt_strategies.pygmalionModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class
prompt_strategies.messages.chatChat dataset wrapping strategy for new internal messages representations
prompt_strategies.dpo.chat_templateDPO prompt strategies for using tokenizer chat templates.
prompt_strategies.dpo.llama3DPO strategies for llama-3 chat template
prompt_strategies.dpo.chatmlDPO strategies for chatml
prompt_strategies.dpo.zephyrDPO strategies for zephyr
prompt_strategies.dpo.user_definedUser-defined DPO strategies
prompt_strategies.dpo.passthroughDPO prompt strategies passthrough/zero-processing strategy
prompt_strategies.kto.llama3KTO strategies for llama-3 chat template
prompt_strategies.kto.chatmlKTO strategies for chatml
prompt_strategies.kto.user_definedUser-defined KTO strategies
prompt_strategies.orpo.chat_templatechatml prompt tokenization strategy for ORPO
prompt_strategies.bradley_terry.llama3chatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template
+
+
+

Kernels

+

Low-level performance optimizations

+ + + + + + + + + + + + + + + + + + + + + + + +
kernels.loraModule for definition of Low-Rank Adaptation (LoRA) Triton kernels.
kernels.gegluModule for definition of GEGLU Triton kernels.
kernels.swigluModule for definition of SwiGLU Triton kernels.
kernels.quantizeDequantization utilities for bitsandbytes integration.
kernels.utilsUtilities for axolotl.kernels submodules.
+
+
+

MonkeyPatches

+

Runtime patches for model optimizations

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
monkeypatch.llama_attn_hijack_flashFlash attention monkey patch for llama model
monkeypatch.llama_attn_hijack_xformersDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments
monkeypatch.mistral_attn_hijack_flashFlash attention monkey patch for mistral model
monkeypatch.multipackmultipack patching for v2 of sample packing
monkeypatch.reloraImplements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune.
monkeypatch.llama_expand_maskexpands the binary attention mask per 3.2.2 of https://arxiv.org/pdf/2107.02027.pdf
monkeypatch.lora_kernelsModule for patching custom LoRA Triton kernels and torch.autograd functions.
monkeypatch.utilsShared utils for the monkeypatches
monkeypatch.btlm_attn_hijack_flashFlash attention monkey patch for cerebras btlm model
monkeypatch.llama_patch_multipackPatched LlamaAttention to use torch.nn.functional.scaled_dot_product_attention
monkeypatch.stablelm_attn_hijack_flashPyTorch StableLM Epoch model.
monkeypatch.trainer_fsdp_optimfix for FSDP optimizer save in trainer w 4.47.0
monkeypatch.transformers_fa_utilssee https://github.com/huggingface/transformers/pull/35834
monkeypatch.unsloth_module for patching with unsloth optimizations
monkeypatch.attention.mllamaMonkeypatch for Vision Llama for FA2 support
monkeypatch.data.batch_dataset_fetchermonkey patches for the dataset fetcher to handle batches of packed indexes
monkeypatch.mixtralPatches to support multipack for mixtral
+
+
+

Utils

+

Utility functions

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
utils.modelsModule for models and model loading
utils.tokenizationModule for tokenization utilities
utils.chat_templatesThis module provides functionality for selecting chat templates based on user choices.
utils.loramodule to get the state dict of a merged lora model
utils.lora_embeddingshelpers for lora embeddings
utils.model_shard_quantmodule to handle loading model on cpu/meta device for FSDP
utils.benchBenchmarking and measurement utilities
utils.freezemodule to freeze/unfreeze parameters by name
utils.trainerModule containing the Trainer class and related functions
utils.schedulersModule for custom LRScheduler class
utils.distributedutility helpers for distributed checks
utils.dictModule containing the DictDefault class
utils.optimizers.adoptCopied from https://github.com/iShohei220/adopt
utils.data.pretrainingdata handling specific to pretraining
utils.data.sftdata handling specific to SFT
utils.gradient_checkpointing.unslothUnsloth checkpointing
+
+
+

Schemas

+

Pydantic data models for Axolotl config

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
utils.schemas.configModule with Pydantic models for configuration.
utils.schemas.modelPydantic models for model input / output, etc. configuration
utils.schemas.trainingPydantic models for training hyperparameters
utils.schemas.datasetsPydantic models for datasets-related configuration
utils.schemas.peftPydantic models for PEFT-related configuration
utils.schemas.trlPydantic models for TRL trainer configuration
utils.schemas.integrationsPydantic models for Axolotl integrations
utils.schemas.enumsEnums for Axolotl input config
utils.schemas.utilsUtilities for Axolotl Pydantic models
+
+
+

Integrations

+

Third-party integrations and extensions

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
integrations.baseBase class for all plugins.
integrations.cut_cross_entropy.argsModule for handling Cut Cross Entropy input arguments.
integrations.grokfast.optimizer
integrations.kd.trainerKD trainer
integrations.liger.argsModule for handling LIGER input arguments.
integrations.lm_eval.argsModule for handling lm eval harness input arguments.
integrations.spectrum.argsModule for handling Spectrum input arguments.
+
+
+

Common

+

Common utilities and shared functionality

+ + + + + + + + + + + + + + + +
common.architecturesCommon architecture specific constants
common.constVarious shared constants
common.datasetsDataset loading utilities.
+
+
+

Models

+

Custom model implementations

+ + + + + + + +
models.mamba.modeling_mamba
+
+
+

Data Processing

+

Data processing utilities

+ + + + + + + + + + + + + + + + + + + + + + + +
utils.collators.corebasic shared collator constants
utils.collators.batchingData collators for axolotl to pad labels and position_ids for packed sequences. Also
utils.collators.mambacollators for Mamba
utils.collators.mm_chatCollators for multi-modal chat messages and packing
utils.samplers.multipackMultipack Batch Sampler
+
+
+

Callbacks

+

Training callbacks

+ + + + + + + + + + + + + + + + + + + + + + + +
utils.callbacks.perplexitycallback to calculate perplexity as an evaluation metric.
utils.callbacks.profilerHF Trainer callback for creating pytorch profiling snapshots
utils.callbacks.lisamodule for LISA
utils.callbacks.mlflow_MLFlow module for trainer callbacks
utils.callbacks.comet_Comet module for trainer callbacks
+ + +
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/integrations.base.html b/docs/api/integrations.base.html new file mode 100644 index 000000000..edb7eb5eb --- /dev/null +++ b/docs/api/integrations.base.html @@ -0,0 +1,1357 @@ + + + + + + + + + +integrations.base – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

integrations.base

+

integrations.base

+

Base class for all plugins.

+

A plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl. +Plugins can be used to integrate third-party models, modify the training process, or add new features.

+

To create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + +
NameDescription
BaseOptimizerFactoryBase class for factories to create custom optimizers
BasePluginBase class for all plugins. Defines the interface for plugin methods.
PluginManagerThe PluginManager class is responsible for loading and managing plugins.
+
+

BaseOptimizerFactory

+
integrations.base.BaseOptimizerFactory()
+

Base class for factories to create custom optimizers

+
+
+

BasePlugin

+
integrations.base.BasePlugin(self)
+

Base class for all plugins. Defines the interface for plugin methods.

+

Attributes: +None

+

Methods: +register(cfg): Registers the plugin with the given configuration. +pre_model_load(cfg): Performs actions before the model is loaded. +post_model_load(cfg, model): Performs actions after the model is loaded. +pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded. +post_lora_load(cfg, model): Performs actions after LoRA weights are loaded. +create_optimizer(cfg, trainer): Creates and returns an optimizer for training. +create_lr_scheduler(cfg, trainer, optimizer): Creates and returns a learning rate scheduler. +add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training. +add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after training.

+
+

Methods

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
add_callbacks_post_trainerAdds callbacks to the trainer after creating the trainer.
add_callbacks_pre_trainersetup callbacks before creating the trainer.
create_lr_schedulerCreates and returns a learning rate scheduler.
create_optimizerCreates and returns an optimizer for training.
get_input_argsReturns a pydantic model for the plugin’s input arguments.
get_trainer_clsReturns a custom class for the trainer.
post_lora_loadPerforms actions after LoRA weights are loaded.
post_model_loadPerforms actions after the model is loaded.
post_trainPerforms actions after training is complete.
post_train_unloadPerforms actions after training is complete and the model is unloaded.
pre_lora_loadPerforms actions before LoRA weights are loaded.
pre_model_loadPerforms actions before the model is loaded.
registerRegisters the plugin with the given configuration.
+
+
add_callbacks_post_trainer
+
integrations.base.BasePlugin.add_callbacks_post_trainer(cfg, trainer)
+

Adds callbacks to the trainer after creating the trainer. +This is useful for callbacks that require access to the model or trainer.

+

Parameters: +cfg (dict): The configuration for the plugin. +trainer (object): The trainer object for training.

+

Returns: +List[callable]: A list of callback functions to be added

+
+
+
add_callbacks_pre_trainer
+
integrations.base.BasePlugin.add_callbacks_pre_trainer(cfg, model)
+

setup callbacks before creating the trainer.

+

Parameters: +cfg (dict): The configuration for the plugin. +model (object): The loaded model.

+

Returns: +List[callable]: A list of callback functions to be added to the TrainingArgs

+
+
+
create_lr_scheduler
+
integrations.base.BasePlugin.create_lr_scheduler(cfg, trainer, optimizer)
+

Creates and returns a learning rate scheduler.

+

Parameters: +cfg (dict): The configuration for the plugin. +trainer (object): The trainer object for training. +optimizer (object): The optimizer for training.

+

Returns: +object: The created learning rate scheduler.

+
+
+
create_optimizer
+
integrations.base.BasePlugin.create_optimizer(cfg, trainer)
+

Creates and returns an optimizer for training.

+

Parameters: +cfg (dict): The configuration for the plugin. +trainer (object): The trainer object for training.

+

Returns: +object: The created optimizer.

+
+
+
get_input_args
+
integrations.base.BasePlugin.get_input_args()
+

Returns a pydantic model for the plugin’s input arguments.

+
+
+
get_trainer_cls
+
integrations.base.BasePlugin.get_trainer_cls(cfg)
+

Returns a custom class for the trainer.

+

Parameters: +cfg (dict): The global axolotl configuration.

+

Returns: +class: The class for the trainer.

+
+
+
post_lora_load
+
integrations.base.BasePlugin.post_lora_load(cfg, model)
+

Performs actions after LoRA weights are loaded.

+

Parameters: +cfg (dict): The configuration for the plugin. +model (object): The loaded model.

+

Returns: +None

+
+
+
post_model_load
+
integrations.base.BasePlugin.post_model_load(cfg, model)
+

Performs actions after the model is loaded.

+

Parameters: +cfg (dict): The configuration for the plugin. +model (object): The loaded model.

+

Returns: +None

+
+
+
post_train
+
integrations.base.BasePlugin.post_train(cfg, model)
+

Performs actions after training is complete.

+

Parameters: +cfg (dict): The axolotl configuration +model (object): The loaded model.

+

Returns: +None

+
+
+
post_train_unload
+
integrations.base.BasePlugin.post_train_unload(cfg)
+

Performs actions after training is complete and the model is unloaded.

+

Parameters: +cfg (dict): The configuration for the plugin.

+

Returns: +None

+
+
+
pre_lora_load
+
integrations.base.BasePlugin.pre_lora_load(cfg, model)
+

Performs actions before LoRA weights are loaded.

+

Parameters: +cfg (dict): The configuration for the plugin. +model (object): The loaded model.

+

Returns: +None

+
+
+
pre_model_load
+
integrations.base.BasePlugin.pre_model_load(cfg)
+

Performs actions before the model is loaded.

+

Parameters: +cfg (dict): The configuration for the plugin.

+

Returns: +None

+
+
+
register
+
integrations.base.BasePlugin.register(cfg)
+

Registers the plugin with the given configuration.

+

Parameters: +cfg (dict): The configuration for the plugin.

+

Returns: +None

+
+
+
+
+

PluginManager

+
integrations.base.PluginManager()
+

The PluginManager class is responsible for loading and managing plugins. +It should be a singleton so it can be accessed from anywhere in the codebase.

+

Attributes: +plugins (ListBasePlugin): A list of loaded plugins.

+

Methods: +get_instance(): Static method to get the singleton instance of PluginManager. +register(plugin_name: str): Registers a new plugin by its name. +pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.

+
+

Methods

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
add_callbacks_post_trainerCalls the add_callbacks_post_trainer method of all registered plugins.
add_callbacks_pre_trainerCalls the add_callbacks_pre_trainer method of all registered plugins.
create_lr_schedulerCalls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.
create_optimizerCalls the create_optimizer method of all registered plugins and returns the first non-None optimizer.
get_input_argsReturns a list of Pydantic classes for all registered plugins’ input arguments.’
get_instanceReturns the singleton instance of PluginManager.
get_trainer_clsCalls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.
post_lora_loadCalls the post_lora_load method of all registered plugins.
post_model_loadCalls the post_model_load method of all registered plugins.
post_train_unloadCalls the post_train_unload method of all registered plugins.
pre_lora_loadCalls the pre_lora_load method of all registered plugins.
pre_model_loadCalls the pre_model_load method of all registered plugins.
registerRegisters a new plugin by its name.
+
+
add_callbacks_post_trainer
+
integrations.base.PluginManager.add_callbacks_post_trainer(cfg, trainer)
+

Calls the add_callbacks_post_trainer method of all registered plugins.

+

Parameters: +cfg (dict): The configuration for the plugins. +trainer (object): The trainer object for training.

+

Returns: +List[callable]: A list of callback functions to be added to the TrainingArgs.

+
+
+
add_callbacks_pre_trainer
+
integrations.base.PluginManager.add_callbacks_pre_trainer(cfg, model)
+

Calls the add_callbacks_pre_trainer method of all registered plugins.

+

Parameters: +cfg (dict): The configuration for the plugins. +model (object): The loaded model.

+

Returns: +List[callable]: A list of callback functions to be added to the TrainingArgs.

+
+
+
create_lr_scheduler
+
integrations.base.PluginManager.create_lr_scheduler(cfg, trainer, optimizer)
+

Calls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.

+

Parameters: +cfg (dict): The configuration for the plugins. +trainer (object): The trainer object for training. +optimizer (object): The optimizer for training.

+

Returns: +object: The created learning rate scheduler, or None if none was found.

+
+
+
create_optimizer
+
integrations.base.PluginManager.create_optimizer(cfg, trainer)
+

Calls the create_optimizer method of all registered plugins and returns the first non-None optimizer.

+

Parameters: +cfg (dict): The configuration for the plugins. +trainer (object): The trainer object for training.

+

Returns: +object: The created optimizer, or None if none was found.

+
+
+
get_input_args
+
integrations.base.PluginManager.get_input_args()
+

Returns a list of Pydantic classes for all registered plugins’ input arguments.’

+

Returns: +list[str]: A list of Pydantic classes for all registered plugins’ input arguments.’

+
+
+
get_instance
+
integrations.base.PluginManager.get_instance()
+

Returns the singleton instance of PluginManager. +If the instance doesn’t exist, it creates a new one.

+
+
+
get_trainer_cls
+
integrations.base.PluginManager.get_trainer_cls(cfg)
+

Calls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.

+

Parameters: +cfg (dict): The configuration for the plugins.

+

Returns: +object: The trainer class, or None if none was found.

+
+
+
post_lora_load
+
integrations.base.PluginManager.post_lora_load(cfg, model)
+

Calls the post_lora_load method of all registered plugins.

+

Parameters: +cfg (dict): The configuration for the plugins. +model (object): The loaded model.

+

Returns: +None

+
+
+
post_model_load
+
integrations.base.PluginManager.post_model_load(cfg, model)
+

Calls the post_model_load method of all registered plugins.

+

Parameters: +cfg (dict): The configuration for the plugins. +model (object): The loaded model.

+

Returns: +None

+
+
+
post_train_unload
+
integrations.base.PluginManager.post_train_unload(cfg)
+

Calls the post_train_unload method of all registered plugins.

+

Parameters: +cfg (dict): The configuration for the plugins. +model (object): The loaded model.

+

Returns: +None

+
+
+
pre_lora_load
+
integrations.base.PluginManager.pre_lora_load(cfg, model)
+

Calls the pre_lora_load method of all registered plugins.

+

Parameters: +cfg (dict): The configuration for the plugins. +model (object): The loaded model.

+

Returns: +None

+
+
+
pre_model_load
+
integrations.base.PluginManager.pre_model_load(cfg)
+

Calls the pre_model_load method of all registered plugins.

+

Parameters: +cfg (dict): The configuration for the plugins.

+

Returns: +None

+
+
+
register
+
integrations.base.PluginManager.register(plugin_name)
+

Registers a new plugin by its name.

+

Parameters: +plugin_name (str): The name of the plugin to be registered.

+

Returns: +None

+

Raises: +ImportError: If the plugin module cannot be imported.

+
+
+
+
+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
load_pluginLoads a plugin based on the given plugin name.
+
+

load_plugin

+
integrations.base.load_plugin(plugin_name)
+

Loads a plugin based on the given plugin name.

+

The plugin name should be in the format “module_name.class_name”. +This function splits the plugin name into module and class, imports the module, +retrieves the class from the module, and creates an instance of the class.

+

Parameters: +plugin_name (str): The name of the plugin to be loaded. The name should be in the format “module_name.class_name”.

+

Returns: +BasePlugin: An instance of the loaded plugin.

+

Raises: +ImportError: If the plugin module cannot be imported.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/integrations.cut_cross_entropy.args.html b/docs/api/integrations.cut_cross_entropy.args.html new file mode 100644 index 000000000..fe038036a --- /dev/null +++ b/docs/api/integrations.cut_cross_entropy.args.html @@ -0,0 +1,904 @@ + + + + + + + + + +integrations.cut_cross_entropy.args – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

integrations.cut_cross_entropy.args

+

integrations.cut_cross_entropy.args

+

Module for handling Cut Cross Entropy input arguments.

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
CutCrossEntropyArgsInput args for Cut Cross Entropy.
+
+

CutCrossEntropyArgs

+
integrations.cut_cross_entropy.args.CutCrossEntropyArgs()
+

Input args for Cut Cross Entropy.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/integrations.grokfast.optimizer.html b/docs/api/integrations.grokfast.optimizer.html new file mode 100644 index 000000000..6815a2abd --- /dev/null +++ b/docs/api/integrations.grokfast.optimizer.html @@ -0,0 +1,841 @@ + + + + + + + + + +integrations.grokfast.optimizer – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

integrations.grokfast.optimizer

+

integrations.grokfast.optimizer

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/integrations.kd.trainer.html b/docs/api/integrations.kd.trainer.html new file mode 100644 index 000000000..4da97496b --- /dev/null +++ b/docs/api/integrations.kd.trainer.html @@ -0,0 +1,939 @@ + + + + + + + + + +integrations.kd.trainer – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

integrations.kd.trainer

+

integrations.kd.trainer

+

KD trainer

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
AxolotlKDTrainerCustom trainer subclass for Knowledge Distillation (KD)
+
+

AxolotlKDTrainer

+
integrations.kd.trainer.AxolotlKDTrainer(
+    self,
+    *_args,
+    bench_data_collator=None,
+    eval_data_collator=None,
+    dataset_tags=None,
+    **kwargs,
+)
+

Custom trainer subclass for Knowledge Distillation (KD)

+
+

Methods

+ + + + + + + + + + + + + +
NameDescription
compute_lossHow the loss is computed by Trainer. By default, all models return the loss in the first element.
+
+
compute_loss
+
integrations.kd.trainer.AxolotlKDTrainer.compute_loss(
+    model,
+    inputs,
+    return_outputs=False,
+    num_items_in_batch=None,
+)
+

How the loss is computed by Trainer. By default, all models return the loss in the first element.

+

Subclass and override for custom behavior.

+ + +
+
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/integrations.liger.args.html b/docs/api/integrations.liger.args.html new file mode 100644 index 000000000..b22e3b2c2 --- /dev/null +++ b/docs/api/integrations.liger.args.html @@ -0,0 +1,904 @@ + + + + + + + + + +integrations.liger.args – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

integrations.liger.args

+

integrations.liger.args

+

Module for handling LIGER input arguments.

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
LigerArgsInput args for LIGER.
+
+

LigerArgs

+
integrations.liger.args.LigerArgs()
+

Input args for LIGER.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/integrations.lm_eval.args.html b/docs/api/integrations.lm_eval.args.html new file mode 100644 index 000000000..e6bc1801d --- /dev/null +++ b/docs/api/integrations.lm_eval.args.html @@ -0,0 +1,904 @@ + + + + + + + + + +integrations.lm_eval.args – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

integrations.lm_eval.args

+

integrations.lm_eval.args

+

Module for handling lm eval harness input arguments.

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
LMEvalArgsInput args for lm eval harness
+
+

LMEvalArgs

+
integrations.lm_eval.args.LMEvalArgs()
+

Input args for lm eval harness

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/integrations.spectrum.args.html b/docs/api/integrations.spectrum.args.html new file mode 100644 index 000000000..333622832 --- /dev/null +++ b/docs/api/integrations.spectrum.args.html @@ -0,0 +1,904 @@ + + + + + + + + + +integrations.spectrum.args – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

integrations.spectrum.args

+

integrations.spectrum.args

+

Module for handling Spectrum input arguments.

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
SpectrumArgsInput args for Spectrum.
+
+

SpectrumArgs

+
integrations.spectrum.args.SpectrumArgs()
+

Input args for Spectrum.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/kernels.geglu.html b/docs/api/kernels.geglu.html new file mode 100644 index 000000000..05a8b8523 --- /dev/null +++ b/docs/api/kernels.geglu.html @@ -0,0 +1,1040 @@ + + + + + + + + + +kernels.geglu – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

kernels.geglu

+

kernels.geglu

+

Module for definition of GEGLU Triton kernels.

+

See “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).

+

Credit to unsloth (https://unsloth.ai/) for inspiration for this implementation.

+
+

Functions

+ + + + + + + + + + + + + + + + + +
NameDescription
geglu_backwardGEGLU backward pass using in-place operations.
geglu_forwardGEGLU forward pass.
+
+

geglu_backward

+
kernels.geglu.geglu_backward(grad_output, gate, up)
+

GEGLU backward pass using in-place operations.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
grad_outputtorch.TensorGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].required
gatetorch.TensorGate tensor from forward pass, shape [batch, seq_len, hidden_dim].required
uptorch.TensorUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
tuple[torch.Tensor, torch.Tensor, torch.Tensor]Tuple containing: - GEGLU activation output (h) - Gradient with respect to gate (grad_gate) - Gradient with respect to up (grad_up)
+
+
+

Note

+

This function modifies its input tensors in-place to store results.

+
+
+
+

geglu_forward

+
kernels.geglu.geglu_forward(gate, up)
+

GEGLU forward pass.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
gatetorch.TensorInput gate tensor of shape [batch, seq_len, hidden_dim].required
uptorch.TensorUp-projection tensor of shape [batch, seq_len, hidden_dim].required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
torch.Tensortorch.Tensor: Output tensor of shape [batch, seq_len, hidden_dim].
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/kernels.lora.html b/docs/api/kernels.lora.html new file mode 100644 index 000000000..2e4b446b1 --- /dev/null +++ b/docs/api/kernels.lora.html @@ -0,0 +1,2100 @@ + + + + + + + + + +kernels.lora – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

kernels.lora

+

kernels.lora

+

Module for definition of Low-Rank Adaptation (LoRA) Triton kernels.

+

See “LoRA: Low-Rank Adaptation of Large Language Models” +(https://arxiv.org/abs/2106.09685).

+

Credit to unsloth (https://unsloth.ai/) for inspiration for this implementation.

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + +
NameDescription
LoRA_MLPOptimized LoRA MLP implementation.
LoRA_OOptimized LoRA implementation for output projection.
LoRA_QKVOptimized LoRA QKV implementation with quantization support.
+
+

LoRA_MLP

+
kernels.lora.LoRA_MLP()
+

Optimized LoRA MLP implementation.

+
+

Methods

+ + + + + + + + + + + + + + + + + +
NameDescription
backwardPerforms backward pass computation for LoRA MLP.
forwardForward pass for LoRA MLP.
+
+
backward
+
kernels.lora.LoRA_MLP.backward(ctx, grad_output)
+

Performs backward pass computation for LoRA MLP.

+
+
Parameters
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
ctxtorch.autograd.function.FunctionCtxContext object storing tensors saved during forward passrequired
grad_outputtorch.TensorGradient of loss with respect to layer outputrequired
+
+
+
Returns
+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
torch.Tensor | NoneTuple containing gradients for all inputs from forward pass:
None- Input gradient tensor (or None)
None- None for weights/quantization states
torch.Tensor | None- LoRA A/B matrix gradients (or None)
torch.Tensor | None- None for scaling factors
None- None for activation functions and flags
+
+
+
+
forward
+
kernels.lora.LoRA_MLP.forward(
+    ctx,
+    X,
+    gate_weight,
+    gate_quant,
+    gate_A,
+    gate_B,
+    gate_scale,
+    up_weight,
+    up_quant,
+    up_A,
+    up_B,
+    up_scale,
+    down_weight,
+    down_quant,
+    down_A,
+    down_B,
+    down_scale,
+    activation_fn,
+    activation_fn_backward,
+    inplace=True,
+)
+

Forward pass for LoRA MLP.

+
+
Parameters
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
ctxAutograd contextrequired
Xtorch.TensorInput featuresrequired
gate_weighttorch.TensorGate projection weightrequired
gate_quantobject | NoneGate quantization staterequired
gate_Atorch.Tensor | NoneGate LoRA A matrixrequired
gate_Btorch.Tensor | NoneGate LoRA B matrixrequired
gate_scalefloatGate LoRA scalerequired
up_weighttorch.TensorUp-projection weightrequired
up_quantobject | NoneUp-projection quantization staterequired
up_Atorch.Tensor | NoneUp-projection LoRA A matrixrequired
up_Btorch.Tensor | NoneUp-projection LoRA B matrixrequired
up_scalefloatUp-projection LoRA scalerequired
down_weighttorch.TensorDown-projection weightrequired
down_quantobject | NoneDown-projection quantization staterequired
down_Atorch.Tensor | NoneDown-projection LoRA A matrixrequired
down_Btorch.Tensor | NoneDown-projection LoRA B matrixrequired
down_scalefloatDown-projection LoRA scalerequired
activation_fnCallableForward activation functionrequired
activation_fn_backwardCallableBackward activation functionrequired
inplacebool | NoneWhether to perform operations in-placeTrue
+
+
+
Returns
+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
torch.TensorOutput transformed by multi-layer perceptron and activation function
+
+
+
+
+
+

LoRA_O

+
kernels.lora.LoRA_O()
+

Optimized LoRA implementation for output projection.

+
+

Methods

+ + + + + + + + + + + + + + + + + +
NameDescription
backwardBackward pass computing gradients for LoRA output projection.
forwardForward pass for output projection with LoRA.
+
+
backward
+
kernels.lora.LoRA_O.backward(ctx, dY)
+

Backward pass computing gradients for LoRA output projection.

+
+
Parameters
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
ctxtorch.autograd.function.FunctionCtxAutograd contextrequired
dYtorch.TensorGradient of loss with respect to outputrequired
+
+
+
Returns
+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
tuple[torch.Tensor, None, None, torch.Tensor | None, torch.Tensor | None, None]Tuple containing gradients for all forward inputs
+
+
+
+
forward
+
kernels.lora.LoRA_O.forward(ctx, X, W, W_quant, A, B, S)
+

Forward pass for output projection with LoRA.

+
+
Parameters
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
ctxtorch.autograd.function.FunctionCtxAutograd contextrequired
Xtorch.TensorInput tensorrequired
Wtorch.TensorOutput projection weightrequired
W_quantQuantState | NoneWeight quantization staterequired
Atorch.Tensor | NoneLoRA A matrixrequired
Btorch.Tensor | NoneLoRA B matrixrequired
SfloatLoRA scaling factorrequired
+
+
+
Returns
+ + + + + + + + + + + + + + + +
NameTypeDescription
torch.TensorOutput projection tensor
+
+
+
+
+
+

LoRA_QKV

+
kernels.lora.LoRA_QKV()
+

Optimized LoRA QKV implementation with quantization support.

+

Implements efficient computation of query, key, value projections with LoRA, +supporting quantization and memory optimization.

+
+

Methods

+ + + + + + + + + + + + + + + + + +
NameDescription
backwardBackward pass computing gradients for LoRA QKV.
forwardForward pass computing Q, K, V projections with LoRA.
+
+
backward
+
kernels.lora.LoRA_QKV.backward(ctx, q_grad, k_grad, v_grad)
+

Backward pass computing gradients for LoRA QKV.

+
+
Parameters
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
ctxtorch.autograd.function.FunctionCtxAutograd contextrequired
q_gradtorch.TensorGradient for query projectionrequired
k_gradtorch.TensorGradient for key projectionrequired
v_gradtorch.TensorGradient for value projectionrequired
+
+
+
Returns
+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
tuple[torch.Tensor, None, None, torch.Tensor | None, torch.Tensor | None, None, None, None, torch.Tensor | None, torch.Tensor | None, None, None, None, torch.Tensor | None, torch.Tensor | None, None, None]Tuple containing gradients for all forward inputs
+
+
+
+
forward
+
kernels.lora.LoRA_QKV.forward(
+    ctx,
+    X,
+    q_weight,
+    q_quant,
+    q_A,
+    q_B,
+    q_scale,
+    k_weight,
+    k_quant,
+    k_A,
+    k_B,
+    k_scale,
+    v_weight,
+    v_quant,
+    v_A,
+    v_B,
+    v_scale,
+    inplace=True,
+)
+

Forward pass computing Q, K, V projections with LoRA.

+
+
Parameters
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
ctxtorch.autograd.function.FunctionCtxAutograd contextrequired
Xtorch.TensorInput tensorrequired
q_weighttorch.TensorQuery projection weightrequired
q_quantQuantState | NoneQuery quantization staterequired
q_Atorch.Tensor | NoneQuery LoRA A matrixrequired
q_Btorch.Tensor | NoneQuery LoRA B matrixrequired
q_scalefloatQuery LoRA scalerequired
k_weighttorch.TensorKey projection weightrequired
k_quantQuantState | NoneKey quantization staterequired
k_Atorch.Tensor | NoneKey LoRA A matrixrequired
k_Btorch.Tensor | NoneKey LoRA B matrixrequired
k_scalefloatKey LoRA scalerequired
v_weighttorch.TensorValue projection weightrequired
v_quantQuantState | NoneValue quantization staterequired
v_Atorch.Tensor | NoneValue LoRA A matrixrequired
v_Btorch.Tensor | NoneValue LoRA B matrixrequired
v_scalefloatValue LoRA scalerequired
inplaceboolWhether to perform operations in-placeTrue
+
+
+
Returns
+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
tuple[torch.Tensor, torch.Tensor, torch.Tensor]Tuple of (Query, Key, Value) projection tensors
+
+
+
+
+
+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
apply_lora_mlp_gegluApplies LoRA to MLP layer with GEGLU activation.
apply_lora_mlp_swigluApplies LoRA to MLP layer with SwiGLU activation.
apply_lora_oApplies LoRA to output projection layer.
apply_lora_qkvApplies LoRA to compute Query, Key, Value projections.
get_lora_parametersGets LoRA parameters from a projection module.
matmul_loraEfficient fused matmul + LoRA computation.
+
+

apply_lora_mlp_geglu

+
kernels.lora.apply_lora_mlp_geglu(self, X, inplace=True)
+

Applies LoRA to MLP layer with GEGLU activation.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
Xtorch.TensorInput tensor for the MLP layerrequired
inplaceboolWhether to perform operations in-place to save memoryTrue
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
torch.TensorOutput tensor after applying LoRA-adapted MLP with GEGLU activation
+
+
+
+

apply_lora_mlp_swiglu

+
kernels.lora.apply_lora_mlp_swiglu(self, X, inplace=True)
+

Applies LoRA to MLP layer with SwiGLU activation.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
Xtorch.TensorInput tensor for the MLP layerrequired
inplaceboolWhether to perform operations in-place to save memoryTrue
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
torch.TensorOutput tensor after applying LoRA-adapted MLP with SwiGLU activation
+
+
+
+

apply_lora_o

+
kernels.lora.apply_lora_o(self, X)
+

Applies LoRA to output projection layer.

+
+

Parameters

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
Xtorch.TensorInput tensorrequired
+
+
+

Returns

+ + + + + + + + + + + + + + + +
NameTypeDescription
torch.TensorTransformed output tensor
+
+
+
+

apply_lora_qkv

+
kernels.lora.apply_lora_qkv(self, X, inplace=True)
+

Applies LoRA to compute Query, Key, Value projections.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
Xtorch.TensorInput tensorrequired
inplaceboolWhether to perform operations in-placeTrue
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
tuple[torch.Tensor, torch.Tensor, torch.Tensor]Tuple of (Query, Key, Value) projection tensors
+
+
+
+

get_lora_parameters

+
kernels.lora.get_lora_parameters(proj)
+

Gets LoRA parameters from a projection module.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
projnn.ModuleThe projection module to extract parameters from.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
torch.TensorA tuple containing the base weight matrix, quantization state, LoRA A matrix,
QuantState | NoneLoRA B matrix, and scaling factor. States and matrices may be None if not
torch.Tensor | Noneavailable.
+
+
+
+

matmul_lora

+
kernels.lora.matmul_lora(X, W, W_quant, A, B, s, out=None)
+

Efficient fused matmul + LoRA computation.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
Xtorch.TensorInput tensor [*, in_features]required
Wtorch.TensorBase weight matrix [out_features, in_features]required
W_quantQuantStateQuantization state for Wrequired
Atorch.TensorLoRA A matrix [rank, in_features]required
Btorch.TensorLoRA B matrix [out_features, rank]required
sfloatLoRA scaling factorrequired
outtorch.Tensor | NoneOptional output tensor for inplace operationsNone
+
+
+

Returns

+ + + + + + + + + + + + + + + +
NameTypeDescription
torch.TensorResult of X @ W + X @ A @ B
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/kernels.quantize.html b/docs/api/kernels.quantize.html new file mode 100644 index 000000000..728379d94 --- /dev/null +++ b/docs/api/kernels.quantize.html @@ -0,0 +1,1004 @@ + + + + + + + + + +kernels.quantize – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

kernels.quantize

+

kernels.quantize

+

Dequantization utilities for bitsandbytes integration.

+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
dequantizeFast NF4 dequantization using bitsandbytes CUDA kernels.
+
+

dequantize

+
kernels.quantize.dequantize(W, quant_state=None, out=None)
+

Fast NF4 dequantization using bitsandbytes CUDA kernels.

+

Performs efficient dequantization of weights from NF4 format using bitsandbytes’ +optimized CUDA implementations. Supports both legacy list and new QuantState +formats.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
Wtorch.TensorQuantized weight tensor to dequantizerequired
quant_stateQuantState | list | NoneQuantization state containing metadata needed for dequantization. Can be either a QuantState object or legacy list format. If None, returns W unchanged.None
outtorch.Tensor | NoneOptional output tensor for storing dequantized results. Must match expected shape and dtype if provided.None
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
torch.TensorDequantized tensor in the specified dtype (fp16 or bf16). Will be transposed if
torch.Tensorinput W was transposed.
+
+
+

Raises

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
AssertionErrorIf provided output tensor doesn’t match expected shape / dtype.
+
+
+

Note

+

Uses CUDA streams for better performance when available in newer bitsandbytes +versions (>0.43.3).

+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/kernels.swiglu.html b/docs/api/kernels.swiglu.html new file mode 100644 index 000000000..60e3ab2a3 --- /dev/null +++ b/docs/api/kernels.swiglu.html @@ -0,0 +1,1037 @@ + + + + + + + + + +kernels.swiglu – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

kernels.swiglu

+

kernels.swiglu

+

Module for definition of SwiGLU Triton kernels.

+

See “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).

+

Credit to unsloth (https://unsloth.ai/) for inspiration for this implementation.

+
+

Functions

+ + + + + + + + + + + + + + + + + +
NameDescription
swiglu_backwardSwiGLU backward pass using in-place operations.
swiglu_forwardSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where
+
+

swiglu_backward

+
kernels.swiglu.swiglu_backward(grad_output, gate, up)
+

SwiGLU backward pass using in-place operations.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
grad_outputtorch.TensorGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].required
gatetorch.TensorGate tensor from forward pass, shape [batch, seq_len, hidden_dim].required
uptorch.TensorUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
tuple[torch.Tensor, torch.Tensor, torch.Tensor]Tuple containing: - Forward pass output (h) - Gradient with respect to gate (df) - Gradient with respect to up-projection (de)
+
+
+
+

swiglu_forward

+
kernels.swiglu.swiglu_forward(gate, up)
+

SwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where +x is the gate tensor.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
gatetorch.TensorInput gate tensor of shape [batch, seq_len, hidden_dim].required
uptorch.TensorUp-projection tensor of shape [batch, seq_len, hidden_dim].required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
torch.TensorOutput tensor of shape [batch, seq_len, hidden_dim].
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/kernels.utils.html b/docs/api/kernels.utils.html new file mode 100644 index 000000000..12ed0665a --- /dev/null +++ b/docs/api/kernels.utils.html @@ -0,0 +1,842 @@ + + + + + + + + + +kernels.utils – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

kernels.utils

+

kernels.utils

+

Utilities for axolotl.kernels submodules.

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/logging_config.html b/docs/api/logging_config.html new file mode 100644 index 000000000..de6ec9b16 --- /dev/null +++ b/docs/api/logging_config.html @@ -0,0 +1,930 @@ + + + + + + + + + +logging_config – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

logging_config

+

logging_config

+

Common logging module for axolotl

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
ColorfulFormatterFormatter to add coloring to log messages by log type
+
+

ColorfulFormatter

+
logging_config.ColorfulFormatter()
+

Formatter to add coloring to log messages by log type

+
+
+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
configure_loggingConfigure with default logging
+
+

configure_logging

+
logging_config.configure_logging()
+

Configure with default logging

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/models.mamba.modeling_mamba.html b/docs/api/models.mamba.modeling_mamba.html new file mode 100644 index 000000000..55fdb3fef --- /dev/null +++ b/docs/api/models.mamba.modeling_mamba.html @@ -0,0 +1,841 @@ + + + + + + + + + +models.mamba.modeling_mamba – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

models.mamba.modeling_mamba

+

models.mamba.modeling_mamba

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.attention.mllama.html b/docs/api/monkeypatch.attention.mllama.html new file mode 100644 index 000000000..a72a4cb90 --- /dev/null +++ b/docs/api/monkeypatch.attention.mllama.html @@ -0,0 +1,926 @@ + + + + + + + + + +monkeypatch.attention.mllama – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.attention.mllama

+

monkeypatch.attention.mllama

+

Monkeypatch for Vision Llama for FA2 support

+
+

Classes

+ + + + + + + + + + + + + + + + + +
NameDescription
MllamaTextCrossFlashAttention2Mllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and
MllamaTextSelfFlashAttention2Mllama flash self-attention module. This module inherits from MllamaTextSelfAttention and
+
+

MllamaTextCrossFlashAttention2

+
monkeypatch.attention.mllama.MllamaTextCrossFlashAttention2(
+    self,
+    *args,
+    **kwargs,
+)
+

Mllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and +implements the forward pass using Flash Attention for improved performance.

+
+
+

MllamaTextSelfFlashAttention2

+
monkeypatch.attention.mllama.MllamaTextSelfFlashAttention2(
+    self,
+    config,
+    layer_idx,
+    *args,
+    **kwargs,
+)
+

Mllama flash self-attention module. This module inherits from MllamaTextSelfAttention and +implements the forward pass using Flash Attention for improved performance.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.btlm_attn_hijack_flash.html b/docs/api/monkeypatch.btlm_attn_hijack_flash.html new file mode 100644 index 000000000..73c9db6ae --- /dev/null +++ b/docs/api/monkeypatch.btlm_attn_hijack_flash.html @@ -0,0 +1,842 @@ + + + + + + + + + +monkeypatch.btlm_attn_hijack_flash – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.btlm_attn_hijack_flash

+

monkeypatch.btlm_attn_hijack_flash

+

Flash attention monkey patch for cerebras btlm model

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.data.batch_dataset_fetcher.html b/docs/api/monkeypatch.data.batch_dataset_fetcher.html new file mode 100644 index 000000000..4758b2d10 --- /dev/null +++ b/docs/api/monkeypatch.data.batch_dataset_fetcher.html @@ -0,0 +1,842 @@ + + + + + + + + + +monkeypatch.data.batch_dataset_fetcher – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.data.batch_dataset_fetcher

+

monkeypatch.data.batch_dataset_fetcher

+

monkey patches for the dataset fetcher to handle batches of packed indexes

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.llama_attn_hijack_flash.html b/docs/api/monkeypatch.llama_attn_hijack_flash.html new file mode 100644 index 000000000..4c990051a --- /dev/null +++ b/docs/api/monkeypatch.llama_attn_hijack_flash.html @@ -0,0 +1,1127 @@ + + + + + + + + + +monkeypatch.llama_attn_hijack_flash – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.llama_attn_hijack_flash

+

monkeypatch.llama_attn_hijack_flash

+

Flash attention monkey patch for llama model

+
+

Classes

+ + + + + + + + + + + + + + + + + +
NameDescription
FusedAttentionFused QKV Attention layer for incrementally improved training efficiency
LlamaDecoderLayerpatched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens
+
+

FusedAttention

+
monkeypatch.llama_attn_hijack_flash.FusedAttention(self, config, q, k, v, o)
+

Fused QKV Attention layer for incrementally improved training efficiency

+
+
+

LlamaDecoderLayer

+
monkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer()
+

patched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens

+
+

Methods

+ + + + + + + + + + + + + +
NameDescription
forward
+
+
forward
+
monkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer.forward(
+    hidden_states,
+    attention_mask=None,
+    position_ids=None,
+    past_key_value=None,
+    output_attentions=False,
+    use_cache=False,
+    padding_mask=None,
+    cu_seqlens=None,
+    max_seqlen=None,
+)
+
+
Parameters
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
hidden_statestorch.FloatTensorinput to the layer of shape (batch, seq_len, embed_dim)required
attention_masktorch.FloatTensor, optionalattention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.None
output_attentionsbool, optionalWhether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.False
use_cachebool, optionalIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).False
past_key_valueTuple(torch.FloatTensor), optionalcached past key and value projection statesNone
+
+
+
+
+
+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + +
NameDescription
flashattn_forwardInput shape: Batch x Time x Channel
flashattn_forward_with_s2attnInput shape: Batch x Time x Channel
generate_qkv
+
+

flashattn_forward

+
monkeypatch.llama_attn_hijack_flash.flashattn_forward(
+    self,
+    hidden_states,
+    attention_mask=None,
+    position_ids=None,
+    past_key_value=None,
+    output_attentions=False,
+    use_cache=False,
+    padding_mask=None,
+    cu_seqlens=None,
+    max_seqlen=None,
+)
+

Input shape: Batch x Time x Channel

+

attention_mask: [bsz, q_len]

+
+
+

flashattn_forward_with_s2attn

+
monkeypatch.llama_attn_hijack_flash.flashattn_forward_with_s2attn(
+    self,
+    hidden_states,
+    attention_mask=None,
+    position_ids=None,
+    past_key_value=None,
+    output_attentions=False,
+    use_cache=False,
+    padding_mask=None,
+    cu_seqlens=None,
+    max_seqlen=None,
+)
+

Input shape: Batch x Time x Channel

+

From: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py

+

attention_mask: [bsz, q_len]

+

cu_seqlens will be ignored if provided +max_seqlen will be ignored if provided

+
+
+

generate_qkv

+
monkeypatch.llama_attn_hijack_flash.generate_qkv(
+    q,
+    k,
+    v,
+    query_padding_mask=None,
+    key_padding_mask=None,
+    kvpacked=False,
+    qkvpacked=False,
+)
+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
q(batch_size, seqlen_q, nheads, d)required
k(batch_size, seqlen_k, nheads_k, d)required
v(batch_size, seqlen_k, nheads_k, d)required
query_padding_mask(batch_size, seqlen), boolNone
key_padding_mask(batch_size, seqlen), boolNone
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.llama_attn_hijack_xformers.html b/docs/api/monkeypatch.llama_attn_hijack_xformers.html new file mode 100644 index 000000000..728f56493 --- /dev/null +++ b/docs/api/monkeypatch.llama_attn_hijack_xformers.html @@ -0,0 +1,842 @@ + + + + + + + + + +monkeypatch.llama_attn_hijack_xformers – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.llama_attn_hijack_xformers

+

monkeypatch.llama_attn_hijack_xformers

+

Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.llama_expand_mask.html b/docs/api/monkeypatch.llama_expand_mask.html new file mode 100644 index 000000000..59fa53579 --- /dev/null +++ b/docs/api/monkeypatch.llama_expand_mask.html @@ -0,0 +1,842 @@ + + + + + + + + + +monkeypatch.llama_expand_mask – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.llama_expand_mask

+

monkeypatch.llama_expand_mask

+

expands the binary attention mask per 3.2.2 of https://arxiv.org/pdf/2107.02027.pdf

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.llama_patch_multipack.html b/docs/api/monkeypatch.llama_patch_multipack.html new file mode 100644 index 000000000..99e86a2b0 --- /dev/null +++ b/docs/api/monkeypatch.llama_patch_multipack.html @@ -0,0 +1,842 @@ + + + + + + + + + +monkeypatch.llama_patch_multipack – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.llama_patch_multipack

+

monkeypatch.llama_patch_multipack

+

Patched LlamaAttention to use torch.nn.functional.scaled_dot_product_attention

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.lora_kernels.html b/docs/api/monkeypatch.lora_kernels.html new file mode 100644 index 000000000..396e030bb --- /dev/null +++ b/docs/api/monkeypatch.lora_kernels.html @@ -0,0 +1,1288 @@ + + + + + + + + + +monkeypatch.lora_kernels – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.lora_kernels

+

monkeypatch.lora_kernels

+

Module for patching custom LoRA Triton kernels and torch.autograd functions.

+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
apply_lora_kernel_patchesApplies optimized Triton kernel patches to a PEFT model.
get_attention_cls_from_configGet the appropriate attention class by inspecting the model config.
original_apply_oOriginal implementation of output projection without optimizations.
original_apply_qkvOriginal implementation of QKV projection without optimizations.
patch_self_attn_loraGiven an axolotl config, this method patches the inferred attention class forward
+
+

apply_lora_kernel_patches

+
monkeypatch.lora_kernels.apply_lora_kernel_patches(model, cfg)
+

Applies optimized Triton kernel patches to a PEFT model.

+

Patches a PEFT model with optimized implementations for MLP and attention +computations. The optimizations include custom Triton kernels for activation +functions and specialized autograd functions for LoRA computations.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
modelPeftModelForCausalLMA PEFT model to be patched with optimized kernels.required
cfgDictDefaultDictionary mapping axolotl config keys to values.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
PeftModelForCausalLMPeftModelForCausalLMThe patched model with optimized kernels.
+
+
+

Raises

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
TypeErrorIf the provided model is not a PeftModelForCausalLM.
NotImplementedErrorIf the model type is not supported.
AssertionErrorIf multiple adapters are active (currently unsupported).
+
+
+

Note

+

The optimizations require LoRA adapters with no dropout and no bias terms. The +function will skip patching if these conditions aren’t met.

+
+
+
+

get_attention_cls_from_config

+
monkeypatch.lora_kernels.get_attention_cls_from_config(cfg)
+

Get the appropriate attention class by inspecting the model config. +Uses dynamic import to support any model architecture that follows +the standard transformers naming convention.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
Type[nn.Module]The appropriate attention class for the model.
+
+
+

Raises

+ +++++ + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
ValueErrorIf base_model not specified or attention class cannot be imported
ImportErrorIf the model module or attention class doesn’t exist
+
+
+
+

original_apply_o

+
monkeypatch.lora_kernels.original_apply_o(self, hidden_states)
+

Original implementation of output projection without optimizations.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
selfnn.ModuleThe attention module instance.required
hidden_statestorch.TensorInput tensor of shape [batch_size, seq_len, hidden_dim]`.required
+
+
+

Returns

+ + + + + + + + + + + + + + + +
NameTypeDescription
torch.TensorThe output projection result.
+
+
+
+

original_apply_qkv

+
monkeypatch.lora_kernels.original_apply_qkv(self, hidden_states)
+

Original implementation of QKV projection without optimizations.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
selfnn.ModuleThe attention module instance.required
hidden_statestorch.TensorInput tensor of shape [batch_size, seq_len, hidden_dim].required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
tuple[torch.Tensor, torch.Tensor, torch.Tensor]A tuple (query_states, key_states, value_states) containing the projected states for query, key, and value.
+
+
+
+

patch_self_attn_lora

+
monkeypatch.lora_kernels.patch_self_attn_lora(cfg)
+

Given an axolotl config, this method patches the inferred attention class forward +pass with optimized LoRA implementations.

+

It modifies the attention class to use optimized QKV and output projections. The +original implementation is preserved and can be restored if needed.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
+
+
+

Raises

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
AssertionErrorIf the required code blocks are not found in the attention implementation.
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.mistral_attn_hijack_flash.html b/docs/api/monkeypatch.mistral_attn_hijack_flash.html new file mode 100644 index 000000000..3519cd162 --- /dev/null +++ b/docs/api/monkeypatch.mistral_attn_hijack_flash.html @@ -0,0 +1,1069 @@ + + + + + + + + + +monkeypatch.mistral_attn_hijack_flash – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.mistral_attn_hijack_flash

+

monkeypatch.mistral_attn_hijack_flash

+

Flash attention monkey patch for mistral model

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
MistralDecoderLayerpatched version of MistralDecoderLayer to pass through the precalculated cu_seqlens
+
+

MistralDecoderLayer

+
monkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer()
+

patched version of MistralDecoderLayer to pass through the precalculated cu_seqlens

+
+

Methods

+ + + + + + + + + + + + + +
NameDescription
forward
+
+
forward
+
monkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer.forward(
+    hidden_states,
+    attention_mask=None,
+    position_ids=None,
+    past_key_value=None,
+    output_attentions=False,
+    use_cache=False,
+    cu_seqlens=None,
+    max_seqlen=None,
+)
+
+
Parameters
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
hidden_statestorch.FloatTensorinput to the layer of shape (batch, seq_len, embed_dim)required
attention_masktorch.FloatTensor, optionalattention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.None
output_attentionsbool, optionalWhether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.False
use_cachebool, optionalIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).False
past_key_valueTuple(torch.FloatTensor), optionalcached past key and value projection statesNone
+
+
+
+
+
+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
generate_qkv
+
+

generate_qkv

+
monkeypatch.mistral_attn_hijack_flash.generate_qkv(
+    q,
+    k,
+    v,
+    query_padding_mask=None,
+    key_padding_mask=None,
+    kvpacked=False,
+    qkvpacked=False,
+)
+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
q(batch_size, seqlen_q, nheads, d)required
k(batch_size, seqlen_k, nheads_k, d)required
v(batch_size, seqlen_k, nheads_k, d)required
query_padding_mask(batch_size, seqlen), boolNone
key_padding_mask(batch_size, seqlen), boolNone
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.mixtral.html b/docs/api/monkeypatch.mixtral.html new file mode 100644 index 000000000..8f873daa4 --- /dev/null +++ b/docs/api/monkeypatch.mixtral.html @@ -0,0 +1,842 @@ + + + + + + + + + +monkeypatch.mixtral – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.mixtral

+

monkeypatch.mixtral

+

Patches to support multipack for mixtral

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.multipack.html b/docs/api/monkeypatch.multipack.html new file mode 100644 index 000000000..ef5cbbb9b --- /dev/null +++ b/docs/api/monkeypatch.multipack.html @@ -0,0 +1,842 @@ + + + + + + + + + +monkeypatch.multipack – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.multipack

+

monkeypatch.multipack

+

multipack patching for v2 of sample packing

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.relora.html b/docs/api/monkeypatch.relora.html new file mode 100644 index 000000000..68c74b125 --- /dev/null +++ b/docs/api/monkeypatch.relora.html @@ -0,0 +1,922 @@ + + + + + + + + + +monkeypatch.relora – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.relora

+

monkeypatch.relora

+

Implements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune.

+
+

Classes

+ + + + + + + + + + + + + + + + + +
NameDescription
ReLoRACallbackCallback to merge LoRA weights into the base model and save full-weight checkpoints
ReLoRASchedulerWraps another scheduler to apply per-lora-restart learning rate warmups.
+
+

ReLoRACallback

+
monkeypatch.relora.ReLoRACallback(self, cfg)
+

Callback to merge LoRA weights into the base model and save full-weight checkpoints

+
+
+

ReLoRAScheduler

+
monkeypatch.relora.ReLoRAScheduler(
+    self,
+    optimizer,
+    inner_schedule,
+    relora_steps,
+    warmup_steps,
+    anneal_steps=1,
+    min_lr_scale=0.001,
+)
+

Wraps another scheduler to apply per-lora-restart learning rate warmups.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.stablelm_attn_hijack_flash.html b/docs/api/monkeypatch.stablelm_attn_hijack_flash.html new file mode 100644 index 000000000..545fd8e83 --- /dev/null +++ b/docs/api/monkeypatch.stablelm_attn_hijack_flash.html @@ -0,0 +1,915 @@ + + + + + + + + + +monkeypatch.stablelm_attn_hijack_flash – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.stablelm_attn_hijack_flash

+

monkeypatch.stablelm_attn_hijack_flash

+

PyTorch StableLM Epoch model.

+
+

Functions

+ + + + + + + + + + + + + + + + + +
NameDescription
repeat_kvThis is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
rotate_halfRotates half the hidden dims of the input.
+
+

repeat_kv

+
monkeypatch.stablelm_attn_hijack_flash.repeat_kv(hidden_states, n_rep)
+

This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, +num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)

+
+
+

rotate_half

+
monkeypatch.stablelm_attn_hijack_flash.rotate_half(x)
+

Rotates half the hidden dims of the input.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.trainer_fsdp_optim.html b/docs/api/monkeypatch.trainer_fsdp_optim.html new file mode 100644 index 000000000..6d7a99aef --- /dev/null +++ b/docs/api/monkeypatch.trainer_fsdp_optim.html @@ -0,0 +1,904 @@ + + + + + + + + + +monkeypatch.trainer_fsdp_optim – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.trainer_fsdp_optim

+

monkeypatch.trainer_fsdp_optim

+

fix for FSDP optimizer save in trainer w 4.47.0

+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
patch_training_loop_for_fsdpmonkeypatch for fixing the training loop for fsdp with optimizer save
+
+

patch_training_loop_for_fsdp

+
monkeypatch.trainer_fsdp_optim.patch_training_loop_for_fsdp()
+

monkeypatch for fixing the training loop for fsdp with optimizer save

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.transformers_fa_utils.html b/docs/api/monkeypatch.transformers_fa_utils.html new file mode 100644 index 000000000..731302f53 --- /dev/null +++ b/docs/api/monkeypatch.transformers_fa_utils.html @@ -0,0 +1,964 @@ + + + + + + + + + +monkeypatch.transformers_fa_utils – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.transformers_fa_utils

+

monkeypatch.transformers_fa_utils

+

see https://github.com/huggingface/transformers/pull/35834

+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
fixed_fa_peft_integration_checkPEFT usually casts the layer norms in float32 for training stability reasons
+
+

fixed_fa_peft_integration_check

+
monkeypatch.transformers_fa_utils.fixed_fa_peft_integration_check(
+    query,
+    key,
+    value,
+    target_dtype=None,
+    preferred_dtype=None,
+)
+

PEFT usually casts the layer norms in float32 for training stability reasons +therefore the input hidden states gets silently casted in float32. Hence, we need +cast them back in float16 / bfloat16 just to be sure everything works as expected. +This might slowdown training & inference so it is recommended to not cast the LayerNorms!

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
querytorch.TensorInput query states to be passed to Flash Attention APIrequired
keytorch.TensorInput key states to be passed to Flash Attention APIrequired
valuetorch.TensorInput value states to be passed to Flash Attention APIrequired
target_dtypetorch.dtype, optionalThe dtype to convert the attention tensors to. Conversion can be ignored by not providing the target dtype.None
preferred_dtypetorch.dtype, optionalThe preferred dtype to convert the attention tensors to regardless of the target dtype.None
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.unsloth_.html b/docs/api/monkeypatch.unsloth_.html new file mode 100644 index 000000000..ab34c497a --- /dev/null +++ b/docs/api/monkeypatch.unsloth_.html @@ -0,0 +1,842 @@ + + + + + + + + + +monkeypatch.unsloth_ – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.unsloth_

+

monkeypatch.unsloth_

+

module for patching with unsloth optimizations

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/monkeypatch.utils.html b/docs/api/monkeypatch.utils.html new file mode 100644 index 000000000..5b38f35b6 --- /dev/null +++ b/docs/api/monkeypatch.utils.html @@ -0,0 +1,927 @@ + + + + + + + + + +monkeypatch.utils – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

monkeypatch.utils

+

monkeypatch.utils

+

Shared utils for the monkeypatches

+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + +
NameDescription
get_cu_seqlensgenerate a cumulative sequence length mask for flash attention using attn mask
get_cu_seqlens_from_pos_idsgenerate a cumulative sequence length mask for flash attention using pos ids
mask_2d_to_4dExpands attention_mask from [bsz, seq_len] to [bsz, 1, tgt_seq_len, src_seq_len].
+
+

get_cu_seqlens

+
monkeypatch.utils.get_cu_seqlens(attn_mask)
+

generate a cumulative sequence length mask for flash attention using attn mask

+
+
+

get_cu_seqlens_from_pos_ids

+
monkeypatch.utils.get_cu_seqlens_from_pos_ids(position_ids)
+

generate a cumulative sequence length mask for flash attention using pos ids

+
+
+

mask_2d_to_4d

+
monkeypatch.utils.mask_2d_to_4d(mask, dtype, tgt_len=None)
+

Expands attention_mask from [bsz, seq_len] to [bsz, 1, tgt_seq_len, src_seq_len]. +This expansion handles packed sequences so that sequences share the same attention mask integer value +when they attend to each other within that sequence. +This expansion transforms the mask to lower triangular form to prevent future peeking.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.alpaca_chat.html b/docs/api/prompt_strategies.alpaca_chat.html new file mode 100644 index 000000000..0b9614be9 --- /dev/null +++ b/docs/api/prompt_strategies.alpaca_chat.html @@ -0,0 +1,959 @@ + + + + + + + + + +prompt_strategies.alpaca_chat – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.alpaca_chat

+

prompt_strategies.alpaca_chat

+

Module for Alpaca prompt strategy classes

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
AlpacaChatPrompterAlpaca Chat Prompter extending the system prompt to for chat-instruct answers
AlpacaConcisePrompterAlpaca Prompter extending the system prompt to ask for concise chat-instruct answers
AlpacaQAPromptTokenizingStrategyTokenizing strategy for AlpacaQA
CamelAIPromptTokenizingStrategyTokenizing strategy for CamelAI datasets
NoSystemPrompterNull Prompter with no system prompts
+
+

AlpacaChatPrompter

+
prompt_strategies.alpaca_chat.AlpacaChatPrompter(self)
+

Alpaca Chat Prompter extending the system prompt to for chat-instruct answers

+
+
+

AlpacaConcisePrompter

+
prompt_strategies.alpaca_chat.AlpacaConcisePrompter(
+    self,
+    prompt_style=PromptStyle.INSTRUCT.value,
+)
+

Alpaca Prompter extending the system prompt to ask for concise chat-instruct answers

+
+
+

AlpacaQAPromptTokenizingStrategy

+
prompt_strategies.alpaca_chat.AlpacaQAPromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for AlpacaQA

+
+
+

CamelAIPromptTokenizingStrategy

+
prompt_strategies.alpaca_chat.CamelAIPromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for CamelAI datasets

+
+
+

NoSystemPrompter

+
prompt_strategies.alpaca_chat.NoSystemPrompter(self)
+

Null Prompter with no system prompts

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.alpaca_instruct.html b/docs/api/prompt_strategies.alpaca_instruct.html new file mode 100644 index 000000000..bd41d44b0 --- /dev/null +++ b/docs/api/prompt_strategies.alpaca_instruct.html @@ -0,0 +1,842 @@ + + + + + + + + + +prompt_strategies.alpaca_instruct – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.alpaca_instruct

+

prompt_strategies.alpaca_instruct

+

Module loading the AlpacaInstructPromptTokenizingStrategy class

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.alpaca_w_system.html b/docs/api/prompt_strategies.alpaca_w_system.html new file mode 100644 index 000000000..8af301a47 --- /dev/null +++ b/docs/api/prompt_strategies.alpaca_w_system.html @@ -0,0 +1,952 @@ + + + + + + + + + +prompt_strategies.alpaca_w_system – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.alpaca_w_system

+

prompt_strategies.alpaca_w_system

+

Prompt strategies loader for alpaca instruction datasets with system prompts

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
InstructionWSystemPromptTokenizingStrategyTokenizing strategy for instruction-based prompts.
OpenOrcaPromptTokenizingStrategyTokenizing strategy for OpenOrca datasets
OpenOrcaSystemDataPrompterAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts
SystemDataPrompterAlpaca Style Prompter that uses system prompts from the dataset
+
+

InstructionWSystemPromptTokenizingStrategy

+
prompt_strategies.alpaca_w_system.InstructionWSystemPromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for instruction-based prompts.

+
+
+

OpenOrcaPromptTokenizingStrategy

+
prompt_strategies.alpaca_w_system.OpenOrcaPromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for OpenOrca datasets

+
+
+

OpenOrcaSystemDataPrompter

+
prompt_strategies.alpaca_w_system.OpenOrcaSystemDataPrompter(
+    self,
+    prompt_style=PromptStyle.INSTRUCT.value,
+)
+

Alpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts

+
+
+

SystemDataPrompter

+
prompt_strategies.alpaca_w_system.SystemDataPrompter(
+    self,
+    prompt_style=PromptStyle.INSTRUCT.value,
+)
+

Alpaca Style Prompter that uses system prompts from the dataset

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.base.html b/docs/api/prompt_strategies.base.html new file mode 100644 index 000000000..bd41bbd80 --- /dev/null +++ b/docs/api/prompt_strategies.base.html @@ -0,0 +1,842 @@ + + + + + + + + + +prompt_strategies.base – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.base

+

prompt_strategies.base

+

module for base dataset transform strategies

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.bradley_terry.llama3.html b/docs/api/prompt_strategies.bradley_terry.llama3.html new file mode 100644 index 000000000..bbdd60753 --- /dev/null +++ b/docs/api/prompt_strategies.bradley_terry.llama3.html @@ -0,0 +1,905 @@ + + + + + + + + + +prompt_strategies.bradley_terry.llama3 – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.bradley_terry.llama3

+

prompt_strategies.bradley_terry.llama3

+

chatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template

+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
icrchatml transforms for datasets with system, input, chosen, rejected
+
+

icr

+
prompt_strategies.bradley_terry.llama3.icr(cfg, **kwargs)
+

chatml transforms for datasets with system, input, chosen, rejected +ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.chat_template.html b/docs/api/prompt_strategies.chat_template.html new file mode 100644 index 000000000..f644357e0 --- /dev/null +++ b/docs/api/prompt_strategies.chat_template.html @@ -0,0 +1,975 @@ + + + + + + + + + +prompt_strategies.chat_template – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.chat_template

+

prompt_strategies.chat_template

+

HF Chat Templates prompt strategy

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + +
NameDescription
ChatTemplatePrompterPrompter for HF chat templates
ChatTemplateStrategyTokenizing strategy for instruction-based prompts.
StrategyLoaderLoad chat template strategy based on configuration.
+
+

ChatTemplatePrompter

+
prompt_strategies.chat_template.ChatTemplatePrompter(
+    self,
+    tokenizer,
+    chat_template,
+    processor=None,
+    max_length=2048,
+    message_property_mappings=None,
+    message_field_training=None,
+    message_field_training_detail=None,
+    field_messages='messages',
+    roles=None,
+    drop_system_message=False,
+)
+

Prompter for HF chat templates

+
+
+

ChatTemplateStrategy

+
prompt_strategies.chat_template.ChatTemplateStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs,
+    sequence_len,
+    roles_to_train=None,
+    train_on_eos=None,
+)
+

Tokenizing strategy for instruction-based prompts.

+
+

Methods

+ + + + + + + + + + + + + + + + + +
NameDescription
find_turnLocate the starting and ending indices of the specified turn in a conversation.
tokenize_promptPublic method that can handle either a single prompt or a batch of prompts.
+
+
find_turn
+
prompt_strategies.chat_template.ChatTemplateStrategy.find_turn(turns, turn_idx)
+

Locate the starting and ending indices of the specified turn in a conversation.

+
+
+
tokenize_prompt
+
prompt_strategies.chat_template.ChatTemplateStrategy.tokenize_prompt(prompt)
+

Public method that can handle either a single prompt or a batch of prompts.

+
+
+
+
+

StrategyLoader

+
prompt_strategies.chat_template.StrategyLoader()
+

Load chat template strategy based on configuration.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.completion.html b/docs/api/prompt_strategies.completion.html new file mode 100644 index 000000000..c96b7bd02 --- /dev/null +++ b/docs/api/prompt_strategies.completion.html @@ -0,0 +1,919 @@ + + + + + + + + + +prompt_strategies.completion – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.completion

+

prompt_strategies.completion

+

Basic completion text

+
+

Classes

+ + + + + + + + + + + + + + + + + +
NameDescription
CompletionPromptTokenizingStrategyTokenizing strategy for Completion prompts.
CompletionPrompterPrompter for completion
+
+

CompletionPromptTokenizingStrategy

+
prompt_strategies.completion.CompletionPromptTokenizingStrategy(
+    self,
+    *args,
+    max_length=None,
+    **kwargs,
+)
+

Tokenizing strategy for Completion prompts.

+
+
+

CompletionPrompter

+
prompt_strategies.completion.CompletionPrompter()
+

Prompter for completion

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.dpo.chat_template.html b/docs/api/prompt_strategies.dpo.chat_template.html new file mode 100644 index 000000000..92b36fc0f --- /dev/null +++ b/docs/api/prompt_strategies.dpo.chat_template.html @@ -0,0 +1,842 @@ + + + + + + + + + +prompt_strategies.dpo.chat_template – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.dpo.chat_template

+

prompt_strategies.dpo.chat_template

+

DPO prompt strategies for using tokenizer chat templates.

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.dpo.chatml.html b/docs/api/prompt_strategies.dpo.chatml.html new file mode 100644 index 000000000..a72582e8c --- /dev/null +++ b/docs/api/prompt_strategies.dpo.chatml.html @@ -0,0 +1,935 @@ + + + + + + + + + +prompt_strategies.dpo.chatml – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.dpo.chatml

+

prompt_strategies.dpo.chatml

+

DPO strategies for chatml

+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
argilla_chatfor argilla/dpo-mix-7k conversations
icrchatml transforms for datasets with system, input, chosen, rejected
intelFor Intel Orca DPO Pairs
ultrafor ultrafeedback binarized conversations
+
+

argilla_chat

+
prompt_strategies.dpo.chatml.argilla_chat(cfg, **kwargs)
+

for argilla/dpo-mix-7k conversations

+
+
+

icr

+
prompt_strategies.dpo.chatml.icr(cfg, **kwargs)
+

chatml transforms for datasets with system, input, chosen, rejected +ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs

+
+
+

intel

+
prompt_strategies.dpo.chatml.intel(cfg, **kwargs)
+

For Intel Orca DPO Pairs

+
+
+

ultra

+
prompt_strategies.dpo.chatml.ultra(cfg, **kwargs)
+

for ultrafeedback binarized conversations

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.dpo.llama3.html b/docs/api/prompt_strategies.dpo.llama3.html new file mode 100644 index 000000000..4b165a255 --- /dev/null +++ b/docs/api/prompt_strategies.dpo.llama3.html @@ -0,0 +1,935 @@ + + + + + + + + + +prompt_strategies.dpo.llama3 – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.dpo.llama3

+

prompt_strategies.dpo.llama3

+

DPO strategies for llama-3 chat template

+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
argilla_chatfor argilla/dpo-mix-7k conversations
icrchatml transforms for datasets with system, input, chosen, rejected
intelFor Intel Orca DPO Pairs
ultrafor ultrafeedback binarized conversations
+
+

argilla_chat

+
prompt_strategies.dpo.llama3.argilla_chat(cfg, **kwargs)
+

for argilla/dpo-mix-7k conversations

+
+
+

icr

+
prompt_strategies.dpo.llama3.icr(cfg, **kwargs)
+

chatml transforms for datasets with system, input, chosen, rejected +ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs

+
+
+

intel

+
prompt_strategies.dpo.llama3.intel(cfg, **kwargs)
+

For Intel Orca DPO Pairs

+
+
+

ultra

+
prompt_strategies.dpo.llama3.ultra(cfg, **kwargs)
+

for ultrafeedback binarized conversations

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.dpo.passthrough.html b/docs/api/prompt_strategies.dpo.passthrough.html new file mode 100644 index 000000000..ffd812d92 --- /dev/null +++ b/docs/api/prompt_strategies.dpo.passthrough.html @@ -0,0 +1,842 @@ + + + + + + + + + +prompt_strategies.dpo.passthrough – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.dpo.passthrough

+

prompt_strategies.dpo.passthrough

+

DPO prompt strategies passthrough/zero-processing strategy

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.dpo.user_defined.html b/docs/api/prompt_strategies.dpo.user_defined.html new file mode 100644 index 000000000..a395ad3cd --- /dev/null +++ b/docs/api/prompt_strategies.dpo.user_defined.html @@ -0,0 +1,842 @@ + + + + + + + + + +prompt_strategies.dpo.user_defined – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.dpo.user_defined

+

prompt_strategies.dpo.user_defined

+

User-defined DPO strategies

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.dpo.zephyr.html b/docs/api/prompt_strategies.dpo.zephyr.html new file mode 100644 index 000000000..c15e86f6d --- /dev/null +++ b/docs/api/prompt_strategies.dpo.zephyr.html @@ -0,0 +1,842 @@ + + + + + + + + + +prompt_strategies.dpo.zephyr – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.dpo.zephyr

+

prompt_strategies.dpo.zephyr

+

DPO strategies for zephyr

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.input_output.html b/docs/api/prompt_strategies.input_output.html new file mode 100644 index 000000000..0831d0efd --- /dev/null +++ b/docs/api/prompt_strategies.input_output.html @@ -0,0 +1,919 @@ + + + + + + + + + +prompt_strategies.input_output – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.input_output

+

prompt_strategies.input_output

+

Module for plain input/output prompt pairs

+
+

Classes

+ + + + + + + + + + + + + + + + + +
NameDescription
RawInputOutputPrompterprompter for raw i/o data
RawInputOutputStrategyPrompt Strategy class for input/output pairs
+
+

RawInputOutputPrompter

+
prompt_strategies.input_output.RawInputOutputPrompter()
+

prompter for raw i/o data

+
+
+

RawInputOutputStrategy

+
prompt_strategies.input_output.RawInputOutputStrategy(
+    self,
+    *args,
+    eos_token=None,
+    **kwargs,
+)
+

Prompt Strategy class for input/output pairs

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.kto.chatml.html b/docs/api/prompt_strategies.kto.chatml.html new file mode 100644 index 000000000..d5f10fabf --- /dev/null +++ b/docs/api/prompt_strategies.kto.chatml.html @@ -0,0 +1,926 @@ + + + + + + + + + +prompt_strategies.kto.chatml – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.kto.chatml

+

prompt_strategies.kto.chatml

+

KTO strategies for chatml

+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + +
NameDescription
argilla_chatfor argilla/kto-mix-15k conversations
intelFor Intel Orca KTO
ultrafor ultrafeedback binarized conversations
+
+

argilla_chat

+
prompt_strategies.kto.chatml.argilla_chat(cfg, **kwargs)
+

for argilla/kto-mix-15k conversations

+
+
+

intel

+
prompt_strategies.kto.chatml.intel(cfg, **kwargs)
+

For Intel Orca KTO +ex: argilla/distilabel-intel-orca-kto

+
+
+

ultra

+
prompt_strategies.kto.chatml.ultra(cfg, **kwargs)
+

for ultrafeedback binarized conversations +ex: argilla/ultrafeedback-binarized-preferences-cleaned-kto

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.kto.llama3.html b/docs/api/prompt_strategies.kto.llama3.html new file mode 100644 index 000000000..bf5ebd206 --- /dev/null +++ b/docs/api/prompt_strategies.kto.llama3.html @@ -0,0 +1,926 @@ + + + + + + + + + +prompt_strategies.kto.llama3 – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.kto.llama3

+

prompt_strategies.kto.llama3

+

KTO strategies for llama-3 chat template

+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + +
NameDescription
argilla_chatfor argilla/kto-mix-15k conversations
intelFor Intel Orca KTO
ultrafor ultrafeedback binarized conversations
+
+

argilla_chat

+
prompt_strategies.kto.llama3.argilla_chat(cfg, **kwargs)
+

for argilla/kto-mix-15k conversations

+
+
+

intel

+
prompt_strategies.kto.llama3.intel(cfg, **kwargs)
+

For Intel Orca KTO +ex: argilla/distilabel-intel-orca-kto

+
+
+

ultra

+
prompt_strategies.kto.llama3.ultra(cfg, **kwargs)
+

for ultrafeedback binarized conversations +ex: argilla/ultrafeedback-binarized-preferences-cleaned-kto

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.kto.user_defined.html b/docs/api/prompt_strategies.kto.user_defined.html new file mode 100644 index 000000000..1629c733c --- /dev/null +++ b/docs/api/prompt_strategies.kto.user_defined.html @@ -0,0 +1,842 @@ + + + + + + + + + +prompt_strategies.kto.user_defined – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.kto.user_defined

+

prompt_strategies.kto.user_defined

+

User-defined KTO strategies

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.llama2_chat.html b/docs/api/prompt_strategies.llama2_chat.html new file mode 100644 index 000000000..76373d623 --- /dev/null +++ b/docs/api/prompt_strategies.llama2_chat.html @@ -0,0 +1,984 @@ + + + + + + + + + +prompt_strategies.llama2_chat – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.llama2_chat

+

prompt_strategies.llama2_chat

+

Prompt Strategy for finetuning Llama2 chat models +see also https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/generation.py#L213 for ma reference implementation.

+

This implementation is based on the Vicuna PR and the fastchat repo, see also: +https://github.com/lm-sys/FastChat/blob/cdd7730686cb1bf9ae2b768ee171bdf7d1ff04f3/fastchat/conversation.py#L847

+

Use dataset type: “llama2_chat” in conig.yml to use this prompt style.

+

E.g. in the config.yml:

+
datasets:
+  - path: llama_finetune_train.jsonl
+    type: llama2_chat
+

The dataset itself should look like this:

+
{'conversations':[{"from": "human", "value": "Who are you?"}, {"from": "gpt", "value": "I am Vicuna"},...]}
+

in a jsonl file. The first message should be from the human, the second from gpt. +For a custom system message, the first “from” can be “system” (followed by alternating “human” and “gpt” turns).

+

Important: Don’t use “special_tokens:” in your config.yml if you are not sure what you are doing!

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + +
NameDescription
LLama2ChatTokenizingStrategyTokenizing strategy for Llama2 prompts.
Llama2ChatConversationA class that manages prompt templates and keeps all conversation history.
Llama2ChatPrompterA prompter that generates prompts for Llama2 models.
+
+

LLama2ChatTokenizingStrategy

+
prompt_strategies.llama2_chat.LLama2ChatTokenizingStrategy(
+    self,
+    *args,
+    **kwargs,
+)
+

Tokenizing strategy for Llama2 prompts. +adapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py

+
+
+

Llama2ChatConversation

+
prompt_strategies.llama2_chat.Llama2ChatConversation(
+    self,
+    name='llama2',
+    system="[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\n",
+    roles=('[INST]', '[/INST]'),
+    messages=list(),
+    offset=0,
+)
+

A class that manages prompt templates and keeps all conversation history. +copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py

+
+

Methods

+ + + + + + + + + + + + + + + + + +
NameDescription
append_messageAppend a new message.
get_promptGet the prompt for generation.
+
+
append_message
+
prompt_strategies.llama2_chat.Llama2ChatConversation.append_message(
+    role,
+    message,
+)
+

Append a new message.

+
+
+
get_prompt
+
prompt_strategies.llama2_chat.Llama2ChatConversation.get_prompt()
+

Get the prompt for generation.

+
+
+
+
+

Llama2ChatPrompter

+
prompt_strategies.llama2_chat.Llama2ChatPrompter()
+

A prompter that generates prompts for Llama2 models.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.messages.chat.html b/docs/api/prompt_strategies.messages.chat.html new file mode 100644 index 000000000..52a8e471d --- /dev/null +++ b/docs/api/prompt_strategies.messages.chat.html @@ -0,0 +1,910 @@ + + + + + + + + + +prompt_strategies.messages.chat – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.messages.chat

+

prompt_strategies.messages.chat

+

Chat dataset wrapping strategy for new internal messages representations

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
ChatMessageDatasetWrappingStrategyChat dataset wrapping strategy for new internal messages representations
+
+

ChatMessageDatasetWrappingStrategy

+
prompt_strategies.messages.chat.ChatMessageDatasetWrappingStrategy(
+    self,
+    processor,
+    message_transform=None,
+    formatter=None,
+    **kwargs,
+)
+

Chat dataset wrapping strategy for new internal messages representations

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.metharme.html b/docs/api/prompt_strategies.metharme.html new file mode 100644 index 000000000..87eb561d8 --- /dev/null +++ b/docs/api/prompt_strategies.metharme.html @@ -0,0 +1,920 @@ + + + + + + + + + +prompt_strategies.metharme – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.metharme

+

prompt_strategies.metharme

+

Module containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class

+
+

Classes

+ + + + + + + + + + + + + + + + + +
NameDescription
MetharmePromptTokenizingStrategyTokenizing strategy for the Metharme models
MetharmePrompterPrompter for the Metharme models.
+
+

MetharmePromptTokenizingStrategy

+
prompt_strategies.metharme.MetharmePromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for the Metharme models

+
+
+

MetharmePrompter

+
prompt_strategies.metharme.MetharmePrompter(self, *args, **kwargs)
+

Prompter for the Metharme models.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.orcamini.html b/docs/api/prompt_strategies.orcamini.html new file mode 100644 index 000000000..22706a87b --- /dev/null +++ b/docs/api/prompt_strategies.orcamini.html @@ -0,0 +1,912 @@ + + + + + + + + + +prompt_strategies.orcamini – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.orcamini

+

prompt_strategies.orcamini

+

Prompt Strategy for finetuning Orca Mini (v2) models +see also https://huggingface.co/psmathur/orca_mini_v2_7b for more information

+

Use dataset type: orcamini in conig.yml to use this prompt style.

+

Compared to the alpaca_w_system.open_orca dataset type, +this one specifies the system prompt with “### System:”.

+

Not suited/tested for multiple-turn conversations without further adjustments.

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
OrcaMiniPrompterAdjusted Prompter for Orca Mini (v2) datasets
+
+

OrcaMiniPrompter

+
prompt_strategies.orcamini.OrcaMiniPrompter(
+    self,
+    prompt_style=PromptStyle.INSTRUCT.value,
+)
+

Adjusted Prompter for Orca Mini (v2) datasets

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.orpo.chat_template.html b/docs/api/prompt_strategies.orpo.chat_template.html new file mode 100644 index 000000000..0f2fa4673 --- /dev/null +++ b/docs/api/prompt_strategies.orpo.chat_template.html @@ -0,0 +1,1030 @@ + + + + + + + + + +prompt_strategies.orpo.chat_template – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.orpo.chat_template

+

prompt_strategies.orpo.chat_template

+

chatml prompt tokenization strategy for ORPO

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
Messagemessage/turn
MessageListconversation
ORPODatasetParsingStrategyStrategy to parse chosen rejected dataset into messagelist
ORPOPrompterSingle Turn prompter for ORPO
ORPOTokenizingStrategyrejected_input_ids
+
+

Message

+
prompt_strategies.orpo.chat_template.Message()
+

message/turn

+
+
+

MessageList

+
prompt_strategies.orpo.chat_template.MessageList()
+

conversation

+
+
+

ORPODatasetParsingStrategy

+
prompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy()
+

Strategy to parse chosen rejected dataset into messagelist

+
+

Methods

+ + + + + + + + + + + + + + + + + + + + + +
NameDescription
get_chosen_conversation_threadDataset structure mappings
get_promptMap the data to extract everything up to the last turn
get_rejected_conversation_threadDataset structure mappings
+
+
get_chosen_conversation_thread
+
prompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_chosen_conversation_thread(
+    prompt,
+)
+

Dataset structure mappings

+
+
+
get_prompt
+
prompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_prompt(
+    prompt,
+)
+

Map the data to extract everything up to the last turn

+
+
+
get_rejected_conversation_thread
+
prompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_rejected_conversation_thread(
+    prompt,
+)
+

Dataset structure mappings

+
+
+
+
+

ORPOPrompter

+
prompt_strategies.orpo.chat_template.ORPOPrompter(
+    self,
+    chat_template,
+    tokenizer,
+)
+

Single Turn prompter for ORPO

+
+
+

ORPOTokenizingStrategy

+
prompt_strategies.orpo.chat_template.ORPOTokenizingStrategy(
+    self,
+    *args,
+    dataset_parser=None,
+    **kwargs,
+)
+

rejected_input_ids +input_ids +rejected_attention_mask +attention_mask +rejected_labels +labels

+
+
+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
loadchatml transforms for datasets with system, input, chosen, rejected
+
+

load

+
prompt_strategies.orpo.chat_template.load(tokenizer, cfg, ds_cfg=None, **kwargs)
+

chatml transforms for datasets with system, input, chosen, rejected

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.pygmalion.html b/docs/api/prompt_strategies.pygmalion.html new file mode 100644 index 000000000..f780f2c67 --- /dev/null +++ b/docs/api/prompt_strategies.pygmalion.html @@ -0,0 +1,920 @@ + + + + + + + + + +prompt_strategies.pygmalion – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.pygmalion

+

prompt_strategies.pygmalion

+

Module containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class

+
+

Classes

+ + + + + + + + + + + + + + + + + +
NameDescription
PygmalionPromptTokenizingStrategyTokenizing strategy for Pygmalion.
PygmalionPrompterPrompter for Pygmalion.
+
+

PygmalionPromptTokenizingStrategy

+
prompt_strategies.pygmalion.PygmalionPromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    *args,
+    **kwargs,
+)
+

Tokenizing strategy for Pygmalion.

+
+
+

PygmalionPrompter

+
prompt_strategies.pygmalion.PygmalionPrompter(self, *args, **kwargs)
+

Prompter for Pygmalion.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.stepwise_supervised.html b/docs/api/prompt_strategies.stepwise_supervised.html new file mode 100644 index 000000000..0ae1c63c7 --- /dev/null +++ b/docs/api/prompt_strategies.stepwise_supervised.html @@ -0,0 +1,916 @@ + + + + + + + + + +prompt_strategies.stepwise_supervised – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.stepwise_supervised

+

prompt_strategies.stepwise_supervised

+

Module for stepwise datasets, typically including a prompt and reasoning traces, +and (optionally) per-step, or per-prompt-trace labels for reward modelling.

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
StepwiseSupervisedPromptTokenizingStrategyTokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.
+
+

StepwiseSupervisedPromptTokenizingStrategy

+
prompt_strategies.stepwise_supervised.StepwiseSupervisedPromptTokenizingStrategy(
+    self,
+    tokenizer,
+    sequence_len=2048,
+    step_separator='\n',
+    max_completion_length=None,
+    train_on_last_step_only=False,
+)
+

Tokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning. +These datasets should include the following columns: +- prompt: the prompt text +- completions: a list of n completion steps +- labels: a list of n labels indicating the “correctness” of each step

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_strategies.user_defined.html b/docs/api/prompt_strategies.user_defined.html new file mode 100644 index 000000000..f2590c380 --- /dev/null +++ b/docs/api/prompt_strategies.user_defined.html @@ -0,0 +1,930 @@ + + + + + + + + + +prompt_strategies.user_defined – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_strategies.user_defined

+

prompt_strategies.user_defined

+

User Defined prompts with configuration from the YML config

+
+

Classes

+ + + + + + + + + + + + + + + + + +
NameDescription
UserDefinedDatasetConfigdataclass configuration representing a userdefined dataset type
UserDefinedPromptTokenizationStrategyPrompt Tokenization Strategy for user defined prompts
+
+

UserDefinedDatasetConfig

+
prompt_strategies.user_defined.UserDefinedDatasetConfig(
+    self,
+    system_prompt='',
+    field_system='system',
+    field_instruction='instruction',
+    field_input='input',
+    field_output='output',
+    format='{instruction} {input} ',
+    no_input_format='{instruction} ',
+    system_format='{system}',
+)
+

dataclass configuration representing a userdefined dataset type

+
+
+

UserDefinedPromptTokenizationStrategy

+
prompt_strategies.user_defined.UserDefinedPromptTokenizationStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Prompt Tokenization Strategy for user defined prompts

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/prompt_tokenizers.html b/docs/api/prompt_tokenizers.html new file mode 100644 index 000000000..3713be5a8 --- /dev/null +++ b/docs/api/prompt_tokenizers.html @@ -0,0 +1,1132 @@ + + + + + + + + + +prompt_tokenizers – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

prompt_tokenizers

+

prompt_tokenizers

+

Module containing PromptTokenizingStrategy and Prompter classes

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
AlpacaMultipleChoicePromptTokenizingStrategyTokenizing strategy for Alpaca Multiple Choice prompts.
AlpacaPromptTokenizingStrategyTokenizing strategy for Alpaca prompts.
AlpacaReflectionPTStrategyTokenizing strategy for Alpaca Reflection prompts.
DatasetWrappingStrategyAbstract class for wrapping datasets for Chat Messages
GPTeacherPromptTokenizingStrategyTokenizing strategy for GPTeacher prompts.
InstructionPromptTokenizingStrategyTokenizing strategy for instruction-based prompts.
InvalidDataExceptionException raised when the data is invalid
JeopardyPromptTokenizingStrategyTokenizing strategy for Jeopardy prompts.
NomicGPT4AllPromptTokenizingStrategyTokenizing strategy for NomicGPT4All prompts.
OpenAssistantPromptTokenizingStrategyTokenizing strategy for OpenAssistant prompts.
PromptTokenizingStrategyAbstract class for tokenizing strategies
ReflectionPromptTokenizingStrategyTokenizing strategy for Reflection prompts.
SummarizeTLDRPromptTokenizingStrategyTokenizing strategy for SummarizeTLDR prompts.
+
+

AlpacaMultipleChoicePromptTokenizingStrategy

+
prompt_tokenizers.AlpacaMultipleChoicePromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for Alpaca Multiple Choice prompts.

+
+
+

AlpacaPromptTokenizingStrategy

+
prompt_tokenizers.AlpacaPromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for Alpaca prompts.

+
+
+

AlpacaReflectionPTStrategy

+
prompt_tokenizers.AlpacaReflectionPTStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for Alpaca Reflection prompts.

+
+
+

DatasetWrappingStrategy

+
prompt_tokenizers.DatasetWrappingStrategy()
+

Abstract class for wrapping datasets for Chat Messages

+
+
+

GPTeacherPromptTokenizingStrategy

+
prompt_tokenizers.GPTeacherPromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for GPTeacher prompts.

+
+
+

InstructionPromptTokenizingStrategy

+
prompt_tokenizers.InstructionPromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for instruction-based prompts.

+
+
+

InvalidDataException

+
prompt_tokenizers.InvalidDataException()
+

Exception raised when the data is invalid

+
+
+

JeopardyPromptTokenizingStrategy

+
prompt_tokenizers.JeopardyPromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for Jeopardy prompts.

+
+
+

NomicGPT4AllPromptTokenizingStrategy

+
prompt_tokenizers.NomicGPT4AllPromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for NomicGPT4All prompts.

+
+
+

OpenAssistantPromptTokenizingStrategy

+
prompt_tokenizers.OpenAssistantPromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for OpenAssistant prompts.

+
+
+

PromptTokenizingStrategy

+
prompt_tokenizers.PromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Abstract class for tokenizing strategies

+
+
+

ReflectionPromptTokenizingStrategy

+
prompt_tokenizers.ReflectionPromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for Reflection prompts.

+
+
+

SummarizeTLDRPromptTokenizingStrategy

+
prompt_tokenizers.SummarizeTLDRPromptTokenizingStrategy(
+    self,
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+

Tokenizing strategy for SummarizeTLDR prompts.

+
+
+
+

Functions

+ + + + + + + + + + + + + + + + + +
NameDescription
parse_tokenized_to_resultParses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result
tokenize_prompt_defaultReturns the default values for the tokenize prompt function
+
+

parse_tokenized_to_result

+
prompt_tokenizers.parse_tokenized_to_result(
+    result,
+    current_len,
+    res,
+    labels,
+    pad_token_id=None,
+)
+

Parses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result

+
+
+

tokenize_prompt_default

+
prompt_tokenizers.tokenize_prompt_default()
+

Returns the default values for the tokenize prompt function

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/train.html b/docs/api/train.html new file mode 100644 index 000000000..d59ba1f7b --- /dev/null +++ b/docs/api/train.html @@ -0,0 +1,1573 @@ + + + + + + + + + +train – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

train

+

train

+

Prepare and train a model on a dataset. Can also infer from a model or merge lora

+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
create_model_cardCreate a model card for the trained model if needed.
determine_resume_checkpointDetermine the checkpoint to resume from based on configuration.
execute_trainingExecute the training process with appropriate SDP kernel configurations.
handle_untrained_tokens_fixApply fixes for untrained tokens if configured.
save_initial_configsSave initial configurations before training.
save_trained_modelSave the trained model according to configuration and training setup.
setup_model_and_tokenizerLoad the tokenizer, processor (for multimodal models), and model based on configuration.
setup_model_and_trainerLoad model, tokenizer, trainer, etc. Helper function to encapsulate the full
setup_model_cardSet up the Axolotl badge and add the Axolotl config to the model card if available.
setup_reference_modelSet up the reference model for RL training if needed.
setup_signal_handlerSet up signal handler for graceful termination.
trainTrain a model on the given dataset.
+
+

create_model_card

+
train.create_model_card(cfg, trainer)
+

Create a model card for the trained model if needed.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
trainerTrainerThe trainer object with model card creation capabilities.required
+
+
+
+

determine_resume_checkpoint

+
train.determine_resume_checkpoint(cfg)
+

Determine the checkpoint to resume from based on configuration.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
str | NonePath to the checkpoint to resume from, or None if not resuming.
+
+
+
+

execute_training

+
train.execute_training(cfg, trainer, resume_from_checkpoint)
+

Execute the training process with appropriate SDP kernel configurations.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
trainerAnyThe configured trainer object.required
resume_from_checkpointstr | NonePath to checkpoint to resume from, if applicable.required
+
+
+
+

handle_untrained_tokens_fix

+
train.handle_untrained_tokens_fix(
+    cfg,
+    model,
+    tokenizer,
+    train_dataset,
+    safe_serialization,
+)
+

Apply fixes for untrained tokens if configured.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
modelPreTrainedModelThe model to apply fixes to.required
tokenizerPreTrainedTokenizerThe tokenizer for token identification.required
train_datasetDatasetThe training dataset to use.required
safe_serializationboolWhether to use safe serialization when saving.required
+
+
+
+

save_initial_configs

+
train.save_initial_configs(cfg, tokenizer, model, peft_config)
+

Save initial configurations before training.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
tokenizerPreTrainedTokenizerThe tokenizer to save.required
modelPreTrainedModelThe model to save configuration for.required
peft_configPeftConfig | NoneThe PEFT configuration to save if applicable.required
+
+
+
+

save_trained_model

+
train.save_trained_model(cfg, trainer, model, safe_serialization)
+

Save the trained model according to configuration and training setup.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
trainerAnyThe trainer object.required
modelPreTrainedModelThe trained model to save.required
safe_serializationboolWhether to use safe serialization.required
+
+
+
+

setup_model_and_tokenizer

+
train.setup_model_and_tokenizer(cfg)
+

Load the tokenizer, processor (for multimodal models), and model based on configuration.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
tuple[PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None]Tuple containing model, tokenizer, peft_config (if LoRA / QLoRA, else None), and processor (if multimodal, else None).
+
+
+
+

setup_model_and_trainer

+
train.setup_model_and_trainer(cfg, dataset_meta)
+

Load model, tokenizer, trainer, etc. Helper function to encapsulate the full +trainer setup.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultThe configuration dictionary with training parameters.required
dataset_metaTrainDatasetMetaObject with training, validation datasets and metadata.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
tuple[HFRLTrainerBuilder | HFCausalTrainerBuilder, PeftModel | PreTrainedModel, PreTrainedTokenizer, PeftConfig | None]Tuple of: - Trainer (Causal or RLHF) - Model - Tokenizer - PEFT config
+
+
+
+

setup_model_card

+
train.setup_model_card(cfg)
+

Set up the Axolotl badge and add the Axolotl config to the model card if available.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
+
+
+
+

setup_reference_model

+
train.setup_reference_model(cfg, tokenizer)
+

Set up the reference model for RL training if needed.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
tokenizerPreTrainedTokenizerThe tokenizer to use for the reference model.required
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
PreTrainedModel | NoneReference model if needed for RL training, None otherwise.
+
+
+
+

setup_signal_handler

+
train.setup_signal_handler(cfg, model, safe_serialization)
+

Set up signal handler for graceful termination.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultDictionary mapping axolotl config keys to values.required
modelPreTrainedModelThe model to save on terminationrequired
safe_serializationboolWhether to use safe serialization when savingrequired
+
+
+
+

train

+
train.train(cfg, dataset_meta)
+

Train a model on the given dataset.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgDictDefaultThe configuration dictionary with training parametersrequired
dataset_metaTrainDatasetMetaObject with training, validation datasets and metadatarequired
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
tuple[PeftModel | PreTrainedModel, PreTrainedTokenizer, Trainer]Tuple of (model, tokenizer) after training
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.bench.html b/docs/api/utils.bench.html new file mode 100644 index 000000000..d69400323 --- /dev/null +++ b/docs/api/utils.bench.html @@ -0,0 +1,907 @@ + + + + + + + + + +utils.bench – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.bench

+

utils.bench

+

Benchmarking and measurement utilities

+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
check_cuda_devicewraps a function and returns the default value instead of running the
+
+

check_cuda_device

+
utils.bench.check_cuda_device(default_value)
+

wraps a function and returns the default value instead of running the +wrapped function if cuda isn’t available or the device is auto +:param default_value: +:return:

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.callbacks.comet_.html b/docs/api/utils.callbacks.comet_.html new file mode 100644 index 000000000..e446bbb98 --- /dev/null +++ b/docs/api/utils.callbacks.comet_.html @@ -0,0 +1,907 @@ + + + + + + + + + +utils.callbacks.comet_ – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.callbacks.comet_

+

utils.callbacks.comet_

+

Comet module for trainer callbacks

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
SaveAxolotlConfigtoCometCallbackCallback to save axolotl config to comet
+
+

SaveAxolotlConfigtoCometCallback

+
utils.callbacks.comet_.SaveAxolotlConfigtoCometCallback(
+    self,
+    axolotl_config_path,
+)
+

Callback to save axolotl config to comet

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.callbacks.lisa.html b/docs/api/utils.callbacks.lisa.html new file mode 100644 index 000000000..5a490eeee --- /dev/null +++ b/docs/api/utils.callbacks.lisa.html @@ -0,0 +1,845 @@ + + + + + + + + + +utils.callbacks.lisa – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.callbacks.lisa

+

utils.callbacks.lisa

+

module for LISA

+

Adapted from https://github.com/OptimalScale/LMFlow/pull/701 for HF transformers & Axolotl +Arxiv: https://arxiv.org/abs/2403.17919 +License: Apache 2.0

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.callbacks.mlflow_.html b/docs/api/utils.callbacks.mlflow_.html new file mode 100644 index 000000000..6c26e5ef6 --- /dev/null +++ b/docs/api/utils.callbacks.mlflow_.html @@ -0,0 +1,907 @@ + + + + + + + + + +utils.callbacks.mlflow_ – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.callbacks.mlflow_

+

utils.callbacks.mlflow_

+

MLFlow module for trainer callbacks

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
SaveAxolotlConfigtoMlflowCallbackCallback to save axolotl config to mlflow
+
+

SaveAxolotlConfigtoMlflowCallback

+
utils.callbacks.mlflow_.SaveAxolotlConfigtoMlflowCallback(
+    self,
+    axolotl_config_path,
+)
+

Callback to save axolotl config to mlflow

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.callbacks.perplexity.html b/docs/api/utils.callbacks.perplexity.html new file mode 100644 index 000000000..f3f11ac1c --- /dev/null +++ b/docs/api/utils.callbacks.perplexity.html @@ -0,0 +1,927 @@ + + + + + + + + + +utils.callbacks.perplexity – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.callbacks.perplexity

+

utils.callbacks.perplexity

+

callback to calculate perplexity as an evaluation metric.

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
PerplexityCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.
+
+

Perplexity

+
utils.callbacks.perplexity.Perplexity(self, tokenizer, max_seq_len, stride=512)
+

Calculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity. +This is a custom variant that doesn’t re-tokenize the input or re-load the model.

+
+

Methods

+ + + + + + + + + + + + + +
NameDescription
computeCompute perplexity in a fixed length sliding window across the sequence.
+
+
compute
+
utils.callbacks.perplexity.Perplexity.compute(model, references=None)
+

Compute perplexity in a fixed length sliding window across the sequence.

+ + +
+
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.callbacks.profiler.html b/docs/api/utils.callbacks.profiler.html new file mode 100644 index 000000000..0f2b754fe --- /dev/null +++ b/docs/api/utils.callbacks.profiler.html @@ -0,0 +1,904 @@ + + + + + + + + + +utils.callbacks.profiler – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.callbacks.profiler

+

utils.callbacks.profiler

+

HF Trainer callback for creating pytorch profiling snapshots

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
PytorchProfilerCallbackPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.
+
+

PytorchProfilerCallback

+
utils.callbacks.profiler.PytorchProfilerCallback(self, steps_to_profile=5)
+

PyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.chat_templates.html b/docs/api/utils.chat_templates.html new file mode 100644 index 000000000..d52679959 --- /dev/null +++ b/docs/api/utils.chat_templates.html @@ -0,0 +1,1028 @@ + + + + + + + + + +utils.chat_templates – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.chat_templates

+

utils.chat_templates

+

This module provides functionality for selecting chat templates based on user choices. +These templates are used for formatting messages in a conversation.

+
+

Functions

+ + + + + + + + + + + + + + + + + +
NameDescription
get_chat_templateFinds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.
register_chat_templateRegisters chat templates.
+
+

get_chat_template

+
utils.chat_templates.get_chat_template(
+    user_choice,
+    jinja_template=None,
+    tokenizer=None,
+)
+

Finds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
user_choicestrThe user’s choice of template.required
jinja_templateOptional[str]The jinja template string. Defaults to None.None
tokenizerOptional[PreTrainedTokenizerBase]The tokenizer. Defaults to None.None
+
+
+

Returns

+ + + + + + + + + + + + + + + +
NameTypeDescription
strstrThe chosen template string.
+
+
+

Raises

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
ValueErrorIf the user_choice is not found in the templates.
+
+
+
+

register_chat_template

+
utils.chat_templates.register_chat_template(template_name, chat_template)
+

Registers chat templates.

+
+

Parameters

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
template_namestrThe name of the template.required
chat_templatestrThe template string.required
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.collators.batching.html b/docs/api/utils.collators.batching.html new file mode 100644 index 000000000..6da5a7219 --- /dev/null +++ b/docs/api/utils.collators.batching.html @@ -0,0 +1,1140 @@ + + + + + + + + + +utils.collators.batching – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.collators.batching

+

utils.collators.batching

+

Data collators for axolotl to pad labels and position_ids for packed sequences. Also +includes logic for handling sequence parallelism collation.

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
BatchSamplerDataCollatorForSeq2SeqCollator for multipack specific to the using the BatchSampler
DataCollatorForSeq2SeqData collator that will dynamically pad the inputs received, as well as the labels and position_ids
PretrainingBatchSamplerDataCollatorForSeq2SeqCollator for multipack specific to the using the BatchSampler
V2BatchSamplerDataCollatorForSeq2SeqCollator for multipack specific to the using the BatchSampler
+
+

BatchSamplerDataCollatorForSeq2Seq

+
utils.collators.batching.BatchSamplerDataCollatorForSeq2Seq(
+    self,
+    tokenizer,
+    model=None,
+    padding=True,
+    max_length=None,
+    pad_to_multiple_of=None,
+    label_pad_token_id=-100,
+    position_pad_token_id=0,
+    return_tensors='pt',
+    sequence_parallel_degree=1,
+)
+

Collator for multipack specific to the using the BatchSampler

+
+
+

DataCollatorForSeq2Seq

+
utils.collators.batching.DataCollatorForSeq2Seq(
+    self,
+    tokenizer,
+    model=None,
+    padding=True,
+    max_length=None,
+    pad_to_multiple_of=None,
+    label_pad_token_id=-100,
+    position_pad_token_id=0,
+    return_tensors='pt',
+    sequence_parallel_degree=1,
+)
+

Data collator that will dynamically pad the inputs received, as well as the labels and position_ids

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
tokenizer[PreTrainedTokenizer] or [PreTrainedTokenizerFast]The tokenizer used for encoding the data.required
model[PreTrainedModel]The model that is being trained. If set and has the prepare_decoder_input_ids_from_labels, use it to prepare the decoder_input_ids This is useful when using label_smoothing to avoid calculating loss twice.None
paddingbool, str or [~utils.PaddingStrategy], optional, defaults to TrueSelect a strategy to pad the returned sequences (according to the model’s padding side and padding index) among: - True or 'longest' (default): Pad to the longest sequence in the batch (or no padding if only a single sequence is provided). - 'max_length': Pad to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided. - False or 'do_not_pad': No padding (i.e., can output a batch with sequences of different lengths).True
max_lengthint, optionalMaximum length of the returned list and optionally padding length (see above).None
pad_to_multiple_ofint, optionalIf set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).None
label_pad_token_idint, optional, defaults to -100The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).-100
return_tensorsstrThe type of Tensor to return. Allowable values are “np”, “pt” and “tf”.'pt'
sequence_parallel_degreeintThe degree of sequence parallelism. Default to 1 for no sequence parallelism.1
+
+
+

Methods

+ + + + + + + + + + + + + +
NameDescription
apply_sequence_parallelismApply sequence parallelism slicing to a batch.
+
+
apply_sequence_parallelism
+
utils.collators.batching.DataCollatorForSeq2Seq.apply_sequence_parallelism(
+    batch,
+)
+

Apply sequence parallelism slicing to a batch.

+
+
Parameters
+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
batchdict[str, torch.Tensor]Batch dictionary from parent collator.required
+
+
+
Returns
+ + + + + + + + + + + + + + + +
NameTypeDescription
torch.TensorSliced batch dictionary.
+
+
+
+
+
+

PretrainingBatchSamplerDataCollatorForSeq2Seq

+
utils.collators.batching.PretrainingBatchSamplerDataCollatorForSeq2Seq(
+    self,
+    *args,
+    multipack_attn=True,
+    **kwargs,
+)
+

Collator for multipack specific to the using the BatchSampler

+
+
+

V2BatchSamplerDataCollatorForSeq2Seq

+
utils.collators.batching.V2BatchSamplerDataCollatorForSeq2Seq(
+    self,
+    tokenizer,
+    model=None,
+    padding=True,
+    max_length=None,
+    pad_to_multiple_of=None,
+    label_pad_token_id=-100,
+    position_pad_token_id=0,
+    return_tensors='pt',
+    sequence_parallel_degree=1,
+)
+

Collator for multipack specific to the using the BatchSampler

+
+
+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
adjust_position_ids_for_sliceAdjust position IDs for a sliced sequence to maintain proper relative positions.
+
+

adjust_position_ids_for_slice

+
utils.collators.batching.adjust_position_ids_for_slice(position_ids, start_idx)
+

Adjust position IDs for a sliced sequence to maintain proper relative positions. +This handles the case where position IDs might not be contiguous due to sample +packing.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.collators.core.html b/docs/api/utils.collators.core.html new file mode 100644 index 000000000..6ed11f9d9 --- /dev/null +++ b/docs/api/utils.collators.core.html @@ -0,0 +1,842 @@ + + + + + + + + + +utils.collators.core – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.collators.core

+

utils.collators.core

+

basic shared collator constants

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.collators.mamba.html b/docs/api/utils.collators.mamba.html new file mode 100644 index 000000000..36712c8a0 --- /dev/null +++ b/docs/api/utils.collators.mamba.html @@ -0,0 +1,904 @@ + + + + + + + + + +utils.collators.mamba – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.collators.mamba

+

utils.collators.mamba

+

collators for Mamba

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
MambaDataCollatorCollator for State Space Models (Mamba)
+
+

MambaDataCollator

+
utils.collators.mamba.MambaDataCollator(self, tokenizer)
+

Collator for State Space Models (Mamba)

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.collators.mm_chat.html b/docs/api/utils.collators.mm_chat.html new file mode 100644 index 000000000..a136e354e --- /dev/null +++ b/docs/api/utils.collators.mm_chat.html @@ -0,0 +1,914 @@ + + + + + + + + + +utils.collators.mm_chat – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.collators.mm_chat

+

utils.collators.mm_chat

+

Collators for multi-modal chat messages and packing

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
MultiModalChatDataCollatorCollator for multi-modal chat messages
+
+

MultiModalChatDataCollator

+
utils.collators.mm_chat.MultiModalChatDataCollator(
+    self,
+    tokenizer,
+    processor,
+    return_tensors='pt',
+    chat_template=None,
+    packing=False,
+    max_images=-1,
+    padding=True,
+    pad_to_multiple_of=None,
+)
+

Collator for multi-modal chat messages

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.data.pretraining.html b/docs/api/utils.data.pretraining.html new file mode 100644 index 000000000..182dab54f --- /dev/null +++ b/docs/api/utils.data.pretraining.html @@ -0,0 +1,842 @@ + + + + + + + + + +utils.data.pretraining – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.data.pretraining

+

utils.data.pretraining

+

data handling specific to pretraining

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.data.sft.html b/docs/api/utils.data.sft.html new file mode 100644 index 000000000..0aefd613d --- /dev/null +++ b/docs/api/utils.data.sft.html @@ -0,0 +1,842 @@ + + + + + + + + + +utils.data.sft – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.data.sft

+

utils.data.sft

+

data handling specific to SFT

+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.dict.html b/docs/api/utils.dict.html new file mode 100644 index 000000000..9e20e2563 --- /dev/null +++ b/docs/api/utils.dict.html @@ -0,0 +1,904 @@ + + + + + + + + + +utils.dict – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.dict

+

utils.dict

+

Module containing the DictDefault class

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
DictDefaultA Dict that returns None instead of returning empty Dict for missing keys.
+
+

DictDefault

+
utils.dict.DictDefault()
+

A Dict that returns None instead of returning empty Dict for missing keys.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.distributed.html b/docs/api/utils.distributed.html new file mode 100644 index 000000000..34a8498dc --- /dev/null +++ b/docs/api/utils.distributed.html @@ -0,0 +1,1011 @@ + + + + + + + + + +utils.distributed – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.distributed

+

utils.distributed

+

utility helpers for distributed checks

+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
barrierActs as a barrier to wait for all processes. This ensures that all processes
compute_and_broadcastCompute a value using the function ‘fn’ only on the specified rank (default is 0).
gather_from_all_ranksRun a callable ‘fn’ on all ranks and gather the results on the specified rank.
gather_scalar_from_all_ranksRun a callable ‘fn’ on all ranks and gather the results on the specified rank.
is_distributedCheck if distributed training is initialized.
is_main_processCheck if the current process is the main process.
reduce_and_broadcastRun a callable ‘fn1’ on all ranks, gather the results, reduce them using ‘fn2’,
zero_firstruns the wrapped context so that rank 0 runs first before other ranks
zero_onlyContext manager that only runs the enclosed block on the main rank.
+
+

barrier

+
utils.distributed.barrier()
+

Acts as a barrier to wait for all processes. This ensures that all processes +reach the barrier before proceeding further.

+
+
+

compute_and_broadcast

+
utils.distributed.compute_and_broadcast(fn)
+

Compute a value using the function ‘fn’ only on the specified rank (default is 0). +The value is then broadcasted to all other ranks.

+

Args: +- fn (callable): A function that computes the value. This should not have any side effects. +- rank (int, optional): The rank that computes the value. Default is 0.

+

Returns: +- The computed value (int or float).

+
+
+

gather_from_all_ranks

+
utils.distributed.gather_from_all_ranks(fn, world_size=1)
+

Run a callable ‘fn’ on all ranks and gather the results on the specified rank.

+

Args: +- fn (callable): A function that computes the value. This should not have any side effects. +- rank (int, optional): The rank that gathers the values. Default is 0. +- world_size (int, optional): Total number of processes in the current distributed setup.

+

Returns: +- A list of computed values from all ranks if on the gathering rank, otherwise None.

+
+
+

gather_scalar_from_all_ranks

+
utils.distributed.gather_scalar_from_all_ranks(fn, world_size=1)
+

Run a callable ‘fn’ on all ranks and gather the results on the specified rank.

+

Args: +- fn (callable): A function that computes the value. This should not have any side effects. +- rank (int, optional): The rank that gathers the values. Default is 0. +- world_size (int, optional): Total number of processes in the current distributed setup.

+

Returns: +- A list of computed values from all ranks if on the gathering rank, otherwise None.

+
+
+

is_distributed

+
utils.distributed.is_distributed()
+

Check if distributed training is initialized.

+
+
+

is_main_process

+
utils.distributed.is_main_process()
+

Check if the current process is the main process. +If not in distributed mode, always return True.

+
+
+

reduce_and_broadcast

+
utils.distributed.reduce_and_broadcast(fn1, fn2)
+

Run a callable ‘fn1’ on all ranks, gather the results, reduce them using ‘fn2’, +and then broadcast the reduced result to all ranks.

+

Args: +- fn1 (callable): A function that computes the value on each rank. +- fn2 (callable): A reduction function that takes a list of values and returns a single value. +- world_size (int, optional): Total number of processes in the current distributed setup.

+

Returns: +- The reduced and broadcasted value.

+
+
+

zero_first

+
utils.distributed.zero_first(is_main)
+

runs the wrapped context so that rank 0 runs first before other ranks

+
+
+

zero_only

+
utils.distributed.zero_only()
+

Context manager that only runs the enclosed block on the main rank.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.freeze.html b/docs/api/utils.freeze.html new file mode 100644 index 000000000..d11cf406c --- /dev/null +++ b/docs/api/utils.freeze.html @@ -0,0 +1,995 @@ + + + + + + + + + +utils.freeze – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.freeze

+

utils.freeze

+

module to freeze/unfreeze parameters by name

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
LayerNamePatternRepresents a regex pattern for layer names, potentially including a parameter index range.
+
+

LayerNamePattern

+
utils.freeze.LayerNamePattern(self, pattern)
+

Represents a regex pattern for layer names, potentially including a parameter index range.

+
+

Methods

+ + + + + + + + + + + + + +
NameDescription
matchChecks if the given layer name matches the regex pattern.
+
+
match
+
utils.freeze.LayerNamePattern.match(name)
+

Checks if the given layer name matches the regex pattern.

+

Parameters: +- name (str): The layer name to check.

+

Returns: +- bool: True if the layer name matches the pattern, False otherwise.

+
+
+
+
+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
freeze_layers_exceptFreezes all layers of the given model except for the layers that match given regex patterns.
+
+

freeze_layers_except

+
utils.freeze.freeze_layers_except(model, regex_patterns)
+

Freezes all layers of the given model except for the layers that match given regex patterns. +Periods in the patterns are treated as literal periods, not as wildcard characters.

+

Parameters: +- model (nn.Module): The PyTorch model to be modified. +- regex_patterns (list of str): List of regex patterns to match layer names to keep unfrozen. +Note that you cannot use a dot as a wildcard character in the patterns since it is reserved for separating layer names. +Also, to match the entire layer name, the pattern should start with “^” and end with “$”, otherwise it will match any part of the layer name. +The range pattern part is optional and it is not compiled as a regex pattern, which means you must put “$” before the range pattern if you want to match the entire layer name. +E.g., [“^model.embed_tokens.weight$[:32000]”, “layers.2[0-9]+.block_sparse_moe.gate.[a-z]+$”]

+

Returns: +None; the model is modified in place.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.gradient_checkpointing.unsloth.html b/docs/api/utils.gradient_checkpointing.unsloth.html new file mode 100644 index 000000000..9a62ae109 --- /dev/null +++ b/docs/api/utils.gradient_checkpointing.unsloth.html @@ -0,0 +1,905 @@ + + + + + + + + + +utils.gradient_checkpointing.unsloth – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.gradient_checkpointing.unsloth

+

utils.gradient_checkpointing.unsloth

+

Unsloth checkpointing

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
Unsloth_Offloaded_Gradient_CheckpointerSaves VRAM by smartly offloading to RAM.
+
+

Unsloth_Offloaded_Gradient_Checkpointer

+
utils.gradient_checkpointing.unsloth.Unsloth_Offloaded_Gradient_Checkpointer()
+

Saves VRAM by smartly offloading to RAM. +Tiny hit to performance, since we mask the movement via non blocking calls.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.lora.html b/docs/api/utils.lora.html new file mode 100644 index 000000000..edb2126b7 --- /dev/null +++ b/docs/api/utils.lora.html @@ -0,0 +1,951 @@ + + + + + + + + + +utils.lora – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.lora

+

utils.lora

+

module to get the state dict of a merged lora model

+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
get_lora_merged_state_dictCreate and return a state_dict that has the LoRA deltas
+
+

get_lora_merged_state_dict

+
utils.lora.get_lora_merged_state_dict(model)
+

Create and return a state_dict that has the LoRA deltas +merged into the base model’s weights, without modifying model in place.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
modeltorch.nn.ModuleA model that has LoRA/PEFT adapters attached.required
+
+
+

Returns

+ + + + + + + + + + + + + + + +
NameTypeDescription
dictdictA state_dict of the merged parameters.
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.lora_embeddings.html b/docs/api/utils.lora_embeddings.html new file mode 100644 index 000000000..d7ad68430 --- /dev/null +++ b/docs/api/utils.lora_embeddings.html @@ -0,0 +1,904 @@ + + + + + + + + + +utils.lora_embeddings – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.lora_embeddings

+

utils.lora_embeddings

+

helpers for lora embeddings

+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
get_linear_embedding_layersreturns the linear embedding layers needed for loras, dependent on the model arch
+
+

get_linear_embedding_layers

+
utils.lora_embeddings.get_linear_embedding_layers(model_type)
+

returns the linear embedding layers needed for loras, dependent on the model arch

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.model_shard_quant.html b/docs/api/utils.model_shard_quant.html new file mode 100644 index 000000000..a9402c15e --- /dev/null +++ b/docs/api/utils.model_shard_quant.html @@ -0,0 +1,916 @@ + + + + + + + + + +utils.model_shard_quant – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.model_shard_quant

+

utils.model_shard_quant

+

module to handle loading model on cpu/meta device for FSDP

+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
load_and_quantizeLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.
+
+

load_and_quantize

+
utils.model_shard_quant.load_and_quantize(
+    module,
+    name,
+    value,
+    device=None,
+    dtype=None,
+    skip_names=None,
+    to_cpu=False,
+    to_meta=False,
+    verbose=False,
+    quant_method='bnb',
+)
+

Loads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.

+

Quantizes Params4bit on device then places on “cpu” if to_cpu=True or “meta” if to_meta=True.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.models.html b/docs/api/utils.models.html new file mode 100644 index 000000000..6468ce1c9 --- /dev/null +++ b/docs/api/utils.models.html @@ -0,0 +1,1158 @@ + + + + + + + + + +utils.models – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.models

+

utils.models

+

Module for models and model loading

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
ModelLoaderModelLoader: managing all the config and monkey patches while loading model
+
+

ModelLoader

+
utils.models.ModelLoader(
+    self,
+    cfg,
+    tokenizer,
+    *,
+    processor=None,
+    inference=False,
+    reference_model=False,
+    **kwargs,
+)
+

ModelLoader: managing all the config and monkey patches while loading model

+
+

Attributes

+ + + + + + + + + + + + + +
NameDescription
has_flash_attnCheck if flash attention is installed
+
+
+

Methods

+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
patch_llama_derived_modelModify all llama derived models in one block
patch_loss_llamaPatch loss functions and other optimizations
set_attention_configsample packing uses custom FA2 patch
set_auto_model_loaderSet self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM
+
+
patch_llama_derived_model
+
utils.models.ModelLoader.patch_llama_derived_model()
+

Modify all llama derived models in one block

+
+
+
patch_loss_llama
+
utils.models.ModelLoader.patch_loss_llama()
+

Patch loss functions and other optimizations

+
+
+
set_attention_config
+
utils.models.ModelLoader.set_attention_config()
+

sample packing uses custom FA2 patch

+
+
+
set_auto_model_loader
+
utils.models.ModelLoader.set_auto_model_loader()
+

Set self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM +(set at __init__). When using a multimodal model, self.auto_model_loader +should be set according to the type of the model.

+
+
+
+
+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
get_module_class_from_nameGets a class from a module by its name.
load_modelLoad a model for a given configuration and tokenizer.
load_tokenizerLoad and configure the tokenizer based on the provided config.
modify_tokenizer_filesModify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.
setup_quantized_meta_for_peftReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device
setup_quantized_peft_meta_for_trainingReplaces dummy quant_state.to method with the original function to allow training to continue
+
+

get_module_class_from_name

+
utils.models.get_module_class_from_name(module, name)
+

Gets a class from a module by its name.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
moduletorch.nn.ModuleThe module to get the class from.required
namestrThe name of the class.required
+
+
+
+

load_model

+
utils.models.load_model(
+    cfg,
+    tokenizer,
+    *,
+    processor=None,
+    inference=False,
+    reference_model=False,
+    **kwargs,
+)
+

Load a model for a given configuration and tokenizer.

+
+
+

load_tokenizer

+
utils.models.load_tokenizer(cfg)
+

Load and configure the tokenizer based on the provided config.

+
+
+

modify_tokenizer_files

+
utils.models.modify_tokenizer_files(tokenizer_path, token_mappings, output_dir)
+

Modify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.

+

This only works with reserved tokens that were added to the tokenizer, not tokens already part of the vocab.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
tokenizer_pathstrPath or name of the original tokenizerrequired
token_mappingsDict[int, str]Dict mapping {token_id (int): new_token_string}required
output_dirstrDirectory to save the modified tokenizerrequired
+
+
+

Returns

+ + + + + + + + + + + + + + + +
NameTypeDescription
strPath to the modified tokenizer directory
+

Ref: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941

+
+
+
+

setup_quantized_meta_for_peft

+
utils.models.setup_quantized_meta_for_peft(model)
+

Replaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device

+
+
+

setup_quantized_peft_meta_for_training

+
utils.models.setup_quantized_peft_meta_for_training(model)
+

Replaces dummy quant_state.to method with the original function to allow training to continue

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.optimizers.adopt.html b/docs/api/utils.optimizers.adopt.html new file mode 100644 index 000000000..be0aecbb6 --- /dev/null +++ b/docs/api/utils.optimizers.adopt.html @@ -0,0 +1,928 @@ + + + + + + + + + +utils.optimizers.adopt – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.optimizers.adopt

+

utils.optimizers.adopt

+

Copied from https://github.com/iShohei220/adopt

+

ADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate (2024) +Taniguchi, Shohei and Harada, Keno and Minegishi, Gouki and Oshima, Yuta and Jeong, Seong Cheol and Nagahara, Go and Iiyama, Tomoshi and Suzuki, Masahiro and Iwasawa, Yusuke and Matsuo, Yutaka

+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
adoptFunctional API that performs ADOPT algorithm computation.
+
+

adopt

+
utils.optimizers.adopt.adopt(
+    params,
+    grads,
+    exp_avgs,
+    exp_avg_sqs,
+    state_steps,
+    foreach=None,
+    capturable=False,
+    differentiable=False,
+    fused=None,
+    grad_scale=None,
+    found_inf=None,
+    has_complex=False,
+    *,
+    beta1,
+    beta2,
+    lr,
+    clip_lambda,
+    weight_decay,
+    decouple,
+    eps,
+    maximize,
+)
+

Functional API that performs ADOPT algorithm computation.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.samplers.multipack.html b/docs/api/utils.samplers.multipack.html new file mode 100644 index 000000000..bd24e0ba3 --- /dev/null +++ b/docs/api/utils.samplers.multipack.html @@ -0,0 +1,914 @@ + + + + + + + + + +utils.samplers.multipack – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.samplers.multipack

+

utils.samplers.multipack

+

Multipack Batch Sampler

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
MultipackBatchSamplerBatch sampler class for multipack
+
+

MultipackBatchSampler

+
utils.samplers.multipack.MultipackBatchSampler(
+    self,
+    sampler,
+    batch_size,
+    batch_max_len,
+    lengths,
+    packing_efficiency_estimate=1.0,
+    drop_last=False,
+    num_count_samples=16,
+    **kwargs,
+)
+

Batch sampler class for multipack

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.schedulers.html b/docs/api/utils.schedulers.html new file mode 100644 index 000000000..8271cfb1d --- /dev/null +++ b/docs/api/utils.schedulers.html @@ -0,0 +1,1177 @@ + + + + + + + + + +utils.schedulers – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.schedulers

+

utils.schedulers

+

Module for custom LRScheduler class

+
+

Classes

+ + + + + + + + + + + + + + + + + +
NameDescription
InterpolatingLogSchedulerA scheduler that interpolates learning rates in a logarithmic fashion
RexLRReflected Exponential (REX) learning rate scheduler.
+
+

InterpolatingLogScheduler

+
utils.schedulers.InterpolatingLogScheduler(
+    self,
+    optimizer,
+    num_steps,
+    min_lr,
+    max_lr,
+    last_epoch=-1,
+)
+

A scheduler that interpolates learning rates in a logarithmic fashion

+
+
+

RexLR

+
utils.schedulers.RexLR(
+    self,
+    optimizer,
+    max_lr,
+    min_lr,
+    total_steps=0,
+    num_warmup_steps=0,
+    last_step=0,
+)
+

Reflected Exponential (REX) learning rate scheduler.

+
    +
  • Original implementation: https://github.com/IvanVassi/REX_LR
  • +
  • Original license: Apache 2.0
  • +
  • Based on: https://arxiv.org/abs/2107.04197
  • +
+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
optimizertorch.optim.OptimizerThe optimizer to schedule the learning rate for.required
max_lrfloatThe maximum learning rate.required
min_lrfloatThe minimum learning rate.required
total_stepsintThe total number of training steps.0
num_warmup_stepsintThe number of warmup steps.0
last_stepintThe index of last step.0
+
+
+
+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + +
NameDescription
get_cosine_schedule_with_min_lr
get_cosine_schedule_with_quadratic_warmupCreate a schedule with a learning rate that decreases following the values of the cosine function between the
get_cosine_schedule_with_warmup_decay_constantImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)
+
+

get_cosine_schedule_with_min_lr

+
utils.schedulers.get_cosine_schedule_with_min_lr(
+    optimizer,
+    num_warmup_steps,
+    num_training_steps,
+    min_lr_ratio=0.0,
+)
+
+

Create a learning rate schedule which has

+
    +
  • linear warmup from 0 -> max_lr over num_warmup_steps
  • +
  • cosine learning rate annealing from max_lr -> min_lr over num_training_steps
  • +
+
+
+
+

get_cosine_schedule_with_quadratic_warmup

+
utils.schedulers.get_cosine_schedule_with_quadratic_warmup(
+    optimizer,
+    num_warmup_steps,
+    num_training_steps,
+    num_cycles=0.5,
+    last_epoch=-1,
+)
+

Create a schedule with a learning rate that decreases following the values of the cosine function between the +initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the +initial lr set in the optimizer.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
optimizer[~torch.optim.Optimizer]The optimizer for which to schedule the learning rate.required
num_warmup_stepsintThe number of steps for the warmup phase.required
num_training_stepsintThe total number of training steps.required
num_cyclesfloat, optional, defaults to 0.5The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine).0.5
last_epochint, optional, defaults to -1The index of the last epoch when resuming training.-1
+
+
+

Return

+

torch.optim.lr_scheduler.LambdaLR with the appropriate schedule.

+
+
+
+

get_cosine_schedule_with_warmup_decay_constant

+
utils.schedulers.get_cosine_schedule_with_warmup_decay_constant(
+    optimizer,
+    num_warmup_steps,
+    num_training_steps,
+    constant_lr_ratio,
+    min_lr_ratio,
+    num_cycles=0.5,
+    last_epoch=-1,
+)
+

Implementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf) +Create a schedule with a learning rate that decreases following the values of the cosine function between the +initial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate +, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
optimizer[~torch.optim.Optimizer]The optimizer for which to schedule the learning rate.required
num_warmup_stepsintThe number of steps for the warmup phase.required
num_training_stepsintThe total number of training steps.required
constant_lr_ratiofloat(float): The ratio of num_training_steps to decrease by cosine function.required
min_lr_ratiofloat(float): The ratio of maximum learning rate for cosine function to decay to minimum learning rate.required
num_cyclesfloat, optional, defaults to 0.5The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine).0.5
last_epochint, optional, defaults to -1The index of the last epoch when resuming training.-1
+
+
+

Return

+

torch.optim.lr_scheduler.LambdaLR with the appropriate schedule.

+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.schemas.config.html b/docs/api/utils.schemas.config.html new file mode 100644 index 000000000..8084f6ba1 --- /dev/null +++ b/docs/api/utils.schemas.config.html @@ -0,0 +1,914 @@ + + + + + + + + + +utils.schemas.config – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.schemas.config

+

utils.schemas.config

+

Module with Pydantic models for configuration.

+
+

Classes

+ + + + + + + + + + + + + + + + + +
NameDescription
AxolotlConfigWCapabilitieswrapper to validate gpu capabilities with the configured options
AxolotlInputConfigWrapper of all config options
+
+

AxolotlConfigWCapabilities

+
utils.schemas.config.AxolotlConfigWCapabilities()
+

wrapper to validate gpu capabilities with the configured options

+
+
+

AxolotlInputConfig

+
utils.schemas.config.AxolotlInputConfig()
+

Wrapper of all config options

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.schemas.datasets.html b/docs/api/utils.schemas.datasets.html new file mode 100644 index 000000000..949adeab9 --- /dev/null +++ b/docs/api/utils.schemas.datasets.html @@ -0,0 +1,996 @@ + + + + + + + + + +utils.schemas.datasets – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.schemas.datasets

+

utils.schemas.datasets

+

Pydantic models for datasets-related configuration

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
DPODatasetDPO configuration subset
KTODatasetKTO configuration subset
PretrainingDatasetPretraining dataset configuration subset
SFTDatasetSFT configuration subset
StepwiseSupervisedDatasetStepwise supervised dataset configuration subset
UserDefinedDPOTypeUser defined typing for DPO
UserDefinedKTOTypeUser defined typing for KTO
UserDefinedPrompterTypeStructure for user defined prompt types
+
+

DPODataset

+
utils.schemas.datasets.DPODataset()
+

DPO configuration subset

+
+
+

KTODataset

+
utils.schemas.datasets.KTODataset()
+

KTO configuration subset

+
+
+

PretrainingDataset

+
utils.schemas.datasets.PretrainingDataset()
+

Pretraining dataset configuration subset

+
+
+

SFTDataset

+
utils.schemas.datasets.SFTDataset()
+

SFT configuration subset

+
+

Methods

+ + + + + + + + + + + + + +
NameDescription
handle_legacy_message_fieldsHandle backwards compatibility between legacy message field mapping and new property mapping system.
+
+
handle_legacy_message_fields
+
utils.schemas.datasets.SFTDataset.handle_legacy_message_fields(data)
+

Handle backwards compatibility between legacy message field mapping and new property mapping system.

+
+
+
+
+

StepwiseSupervisedDataset

+
utils.schemas.datasets.StepwiseSupervisedDataset()
+

Stepwise supervised dataset configuration subset

+
+
+

UserDefinedDPOType

+
utils.schemas.datasets.UserDefinedDPOType()
+

User defined typing for DPO

+
+
+

UserDefinedKTOType

+
utils.schemas.datasets.UserDefinedKTOType()
+

User defined typing for KTO

+
+
+

UserDefinedPrompterType

+
utils.schemas.datasets.UserDefinedPrompterType()
+

Structure for user defined prompt types

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.schemas.enums.html b/docs/api/utils.schemas.enums.html new file mode 100644 index 000000000..30762e866 --- /dev/null +++ b/docs/api/utils.schemas.enums.html @@ -0,0 +1,924 @@ + + + + + + + + + +utils.schemas.enums – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.schemas.enums

+

utils.schemas.enums

+

Enums for Axolotl input config

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + +
NameDescription
ChatTemplateChat templates configuration subset
CustomSupportedOptimizersCustom supported optimizers
RLTypeRL trainer type configuration subset
+
+

ChatTemplate

+
utils.schemas.enums.ChatTemplate()
+

Chat templates configuration subset

+
+
+

CustomSupportedOptimizers

+
utils.schemas.enums.CustomSupportedOptimizers()
+

Custom supported optimizers

+
+
+

RLType

+
utils.schemas.enums.RLType()
+

RL trainer type configuration subset

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.schemas.integrations.html b/docs/api/utils.schemas.integrations.html new file mode 100644 index 000000000..ef409fd68 --- /dev/null +++ b/docs/api/utils.schemas.integrations.html @@ -0,0 +1,954 @@ + + + + + + + + + +utils.schemas.integrations – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.schemas.integrations

+

utils.schemas.integrations

+

Pydantic models for Axolotl integrations

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
CometConfigComet configuration subset
GradioConfigGradio configuration subset
LISAConfigLISA configuration subset
MLFlowConfigMLFlow configuration subset
RayConfigRay launcher configuration subset
WandbConfigWandb configuration subset
+
+

CometConfig

+
utils.schemas.integrations.CometConfig()
+

Comet configuration subset

+
+
+

GradioConfig

+
utils.schemas.integrations.GradioConfig()
+

Gradio configuration subset

+
+
+

LISAConfig

+
utils.schemas.integrations.LISAConfig()
+

LISA configuration subset

+
+
+

MLFlowConfig

+
utils.schemas.integrations.MLFlowConfig()
+

MLFlow configuration subset

+
+
+

RayConfig

+
utils.schemas.integrations.RayConfig()
+

Ray launcher configuration subset

+
+
+

WandbConfig

+
utils.schemas.integrations.WandbConfig()
+

Wandb configuration subset

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.schemas.model.html b/docs/api/utils.schemas.model.html new file mode 100644 index 000000000..aa7ff3872 --- /dev/null +++ b/docs/api/utils.schemas.model.html @@ -0,0 +1,924 @@ + + + + + + + + + +utils.schemas.model – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.schemas.model

+

utils.schemas.model

+

Pydantic models for model input / output, etc. configuration

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + +
NameDescription
ModelInputConfigModel configuration subset
ModelOutputConfigmodel save configuration subset
SpecialTokensConfigSpecial tokens configuration subset
+
+

ModelInputConfig

+
utils.schemas.model.ModelInputConfig()
+

Model configuration subset

+
+
+

ModelOutputConfig

+
utils.schemas.model.ModelOutputConfig()
+

model save configuration subset

+
+
+

SpecialTokensConfig

+
utils.schemas.model.SpecialTokensConfig()
+

Special tokens configuration subset

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.schemas.peft.html b/docs/api/utils.schemas.peft.html new file mode 100644 index 000000000..7d825907f --- /dev/null +++ b/docs/api/utils.schemas.peft.html @@ -0,0 +1,934 @@ + + + + + + + + + +utils.schemas.peft – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.schemas.peft

+

utils.schemas.peft

+

Pydantic models for PEFT-related configuration

+
+

Classes

+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
LoftQConfigLoftQ configuration subset
LoraConfigPeft / LoRA configuration subset
PeftConfigPEFT configuration subset
ReLoRAConfigReLoRA configuration subset
+
+

LoftQConfig

+
utils.schemas.peft.LoftQConfig()
+

LoftQ configuration subset

+
+
+

LoraConfig

+
utils.schemas.peft.LoraConfig()
+

Peft / LoRA configuration subset

+
+
+

PeftConfig

+
utils.schemas.peft.PeftConfig()
+

PEFT configuration subset

+
+
+

ReLoRAConfig

+
utils.schemas.peft.ReLoRAConfig()
+

ReLoRA configuration subset

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.schemas.training.html b/docs/api/utils.schemas.training.html new file mode 100644 index 000000000..5e8ac6a46 --- /dev/null +++ b/docs/api/utils.schemas.training.html @@ -0,0 +1,914 @@ + + + + + + + + + +utils.schemas.training – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.schemas.training

+

utils.schemas.training

+

Pydantic models for training hyperparameters

+
+

Classes

+ + + + + + + + + + + + + + + + + +
NameDescription
HyperparametersConfigTraining hyperparams configuration subset
LrGroupCustom learning rate group configuration
+
+

HyperparametersConfig

+
utils.schemas.training.HyperparametersConfig()
+

Training hyperparams configuration subset

+
+
+

LrGroup

+
utils.schemas.training.LrGroup()
+

Custom learning rate group configuration

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.schemas.trl.html b/docs/api/utils.schemas.trl.html new file mode 100644 index 000000000..567009bd8 --- /dev/null +++ b/docs/api/utils.schemas.trl.html @@ -0,0 +1,904 @@ + + + + + + + + + +utils.schemas.trl – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.schemas.trl

+

utils.schemas.trl

+

Pydantic models for TRL trainer configuration

+
+

Classes

+ + + + + + + + + + + + + +
NameDescription
TRLConfigInput args for TRL.
+
+

TRLConfig

+
utils.schemas.trl.TRLConfig()
+

Input args for TRL.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.schemas.utils.html b/docs/api/utils.schemas.utils.html new file mode 100644 index 000000000..1582e3a95 --- /dev/null +++ b/docs/api/utils.schemas.utils.html @@ -0,0 +1,987 @@ + + + + + + + + + +utils.schemas.utils – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.schemas.utils

+

utils.schemas.utils

+

Utilities for Axolotl Pydantic models

+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
handle_legacy_message_fields_logicHandle backwards compatibility between legacy message field mapping and new property mapping system.
+
+

handle_legacy_message_fields_logic

+
utils.schemas.utils.handle_legacy_message_fields_logic(data)
+

Handle backwards compatibility between legacy message field mapping and new property mapping system.

+

Previously, the config only supported mapping ‘role’ and ‘content’ fields via dedicated config options: +- message_field_role: Mapped to the role field +- message_field_content: Mapped to the content field

+

The new system uses message_property_mappings to support arbitrary field mappings: +message_property_mappings: +role: source_role_field +content: source_content_field +additional_field: source_field

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
datadictDictionary containing configuration datarequired
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
dictUpdated dictionary with message field mappings consolidated
+
+
+

Raises

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
ValueErrorIf there are conflicts between legacy and new mappings
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.tokenization.html b/docs/api/utils.tokenization.html new file mode 100644 index 000000000..2d01051db --- /dev/null +++ b/docs/api/utils.tokenization.html @@ -0,0 +1,924 @@ + + + + + + + + + +utils.tokenization – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.tokenization

+

utils.tokenization

+

Module for tokenization utilities

+
+

Functions

+ + + + + + + + + + + + + + + + + +
NameDescription
color_token_for_rl_debugHelper function to color tokens based on their type.
process_tokens_for_rl_debugHelper function to process and color tokens.
+
+

color_token_for_rl_debug

+
utils.tokenization.color_token_for_rl_debug(
+    decoded_token,
+    encoded_token,
+    color,
+    text_only,
+)
+

Helper function to color tokens based on their type.

+
+
+

process_tokens_for_rl_debug

+
utils.tokenization.process_tokens_for_rl_debug(
+    tokens,
+    color,
+    tokenizer,
+    text_only,
+)
+

Helper function to process and color tokens.

+ + +
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/api/utils.trainer.html b/docs/api/utils.trainer.html new file mode 100644 index 000000000..999a39bb7 --- /dev/null +++ b/docs/api/utils.trainer.html @@ -0,0 +1,1058 @@ + + + + + + + + + +utils.trainer – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ + + + +
+

utils.trainer

+

utils.trainer

+

Module containing the Trainer class and related functions

+
+

Functions

+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameDescription
add_pose_position_idsuse the PoSE technique to extend the context length by randomly skipping
add_position_idsHandle both single-example and batched data.
drop_long_seqDrop samples whose sequence length is either too long (> sequence_len)
setup_trainerHelper method for instantiating and building a (causal or RLHF) trainer.
+
+

add_pose_position_ids

+
utils.trainer.add_pose_position_ids(
+    sample,
+    max_context_len=32768,
+    split_on_token_ids=None,
+    chunks=2,
+)
+

use the PoSE technique to extend the context length by randomly skipping +positions in the context. We only want to skip right before tokens in +the split_on_token_ids list. We should attempt to randomly distribute +the skips, but we don’t need the final position_ids to be the full +context_len. There may be multiple turns in the context, so we want to +make sure we take into account the maximum possible number of skips +remaining in each sample.

+
+
+

add_position_ids

+
utils.trainer.add_position_ids(sample)
+

Handle both single-example and batched data. +- single example: sample[‘input_ids’] is a list[int] +- batched data: sample[‘input_ids’] is a list[list[int]]

+
+
+

drop_long_seq

+
utils.trainer.drop_long_seq(sample, sequence_len=2048, min_sequence_len=2)
+

Drop samples whose sequence length is either too long (> sequence_len) +or too short (< min_sequence_len).

+

Works for both single-example (list[int]) or batched (list[list[int]]).

+
+
+

setup_trainer

+
utils.trainer.setup_trainer(
+    cfg,
+    train_dataset,
+    eval_dataset,
+    model,
+    tokenizer,
+    processor,
+    total_num_steps,
+    model_ref=None,
+    peft_config=None,
+)
+

Helper method for instantiating and building a (causal or RLHF) trainer.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgAxolotl config object containing training parameters.required
train_datasetDataset to use for training.required
eval_datasetDataset to use for evaluation.required
modelThe model to train.required
tokenizerTokenizer for processing text input.required
processorProcessor for data preparation.required
total_num_stepsThe total number of training steps.required
model_refOptional reference model for RLHF training. Default is None.None
peft_configOptional PEFT (Parameter-Efficient Fine-Tuning) configuration. Default is None.None
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
A trainer instance (either HFRLTrainer or HFCausalTrainer) configured based on the provided parameters.
+ + +
+
+
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/batch_vs_grad.html b/docs/batch_vs_grad.html index c06b1a198..a228d8953 100644 --- a/docs/batch_vs_grad.html +++ b/docs/batch_vs_grad.html @@ -144,7 +144,7 @@ ul.task-list li input[type="checkbox"] { + @@ -424,7 +430,11 @@ ul.task-list li input[type="checkbox"] {
  • Memory Consumption with Batch Size: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.

  • Gradient Accumulation: With gradient accumulation, you’re effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you’re only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.

  • -

    Example 1: Micro batch size: 3 Gradient accumulation steps: 2 Number of GPUs: 3 Total batch size = 3 * 2 * 3 = 18

    +

    Example 1: +Micro batch size: 3 +Gradient accumulation steps: 2 +Number of GPUs: 3 +Total batch size = 3 * 2 * 3 = 18

    | GPU 1          | GPU 2          | GPU 3          |
     |----------------|----------------|----------------|
     | S1, S2, S3     | S4, S5, S6     | S7, S8, S9     |
    @@ -442,7 +452,11 @@ Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 +
     
     Weight update for w1:
     w1_new = w1_old - learning rate x (Total gradient for w1 / 18)
    -

    Example 2: Micro batch size: 2 Gradient accumulation steps: 1 Number of GPUs: 3 Total batch size = 2 * 1 * 3 = 6

    +

    Example 2: +Micro batch size: 2 +Gradient accumulation steps: 1 +Number of GPUs: 3 +Total batch size = 2 * 1 * 3 = 6

    | GPU 1     | GPU 2     | GPU 3     |
     |-----------|-----------|-----------|
     | S1, S2    | S3, S4    | S5, S6    |
    diff --git a/docs/cli.html b/docs/cli.html
    index be8500746..6e9fa3898 100644
    --- a/docs/cli.html
    +++ b/docs/cli.html
    @@ -7,7 +7,7 @@
     
     
     
    -CLI Reference – Axolotl
    +Command Line Interface (CLI) – Axolotl
     
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    + +
    + + +
    + + + +
    + +
    +
    +

    Sequence Parallelism

    +
    + +
    +
    + Train with long sequences split across multiple GPUs. +
    +
    + + +
    + + + + +
    + + + +
    + + +
    +

    Sequence Parallelism

    +

    Sequence parallelism is a technique that splits sequences across multiple GPUs, +allowing you to train with very long sequences that wouldn’t fit on a single GPU. Each +GPU processes a different portion of the sequence, and the results are aggregated +through a ring communication pattern.

    +
    +

    When to Use Sequence Parallelism

    +

    Use sequence parallelism when:

    +
      +
    • You need to train with sequence lengths that don’t fit into a single GPU’s memory
    • +
    • You have multiple GPUs available
    • +
    • You’re experiencing OOM (Out Of Memory) errors with long sequences
    • +
    +
    +
    +

    Configuration

    +

    To enable sequence parallelism, add the following to your configuration file:

    +
    # Set to a divisor (> 1) of the number of GPUs available
    +sequence_parallel_degree: 4  # Split sequences across 4 GPUs
    +

    The sequence_parallel_degree should be a divisor of the total number of GPUs. For example:

    +
      +
    • With 8 GPUs, valid values would be 2, 4, or 8
    • +
    • With 4 GPUs, valid values would be 2 or 4
    • +
    +
    +
    +

    Implementation Details

    +

    When sequence parallelism is enabled:

    +
      +
    1. Each sequence is divided into equal chunks across the GPUs in a sequence parallel group
    2. +
    3. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
    4. +
    5. Position IDs are adjusted to maintain proper relative positions, especially for packed sequences
    6. +
    7. The trainer uses special ring communication patterns for attention operations
    8. +
    +
    +
    +

    Requirements

    +

    To use sequence parallelism, you need:

    +
      +
    • Multiple GPUs (at least 2)
    • +
    • The ring-flash-attn package. Install with: +
        +
      • pip install axolotl[ring-flash-attn] (preferred)
      • +
      • pip install ring-flash-attn>=0.1.4
      • +
    • +
    +
    +
    +

    Limitations

    +
      +
    • Flash attention must be enabled for this to work (flash_attention: true in config YAML)
    • +
    • May have a small performance overhead due to communication between GPUs
    • +
    +
    +
    +

    Example

    +
    # Example config with sequence parallelism
    +base_model: meta-llama/Llama-3-8B-Instruct
    +sequence_len: 8192
    +sequence_parallel_degree: 2  # Split each sequence into 4 parts
    +flash_attention: true  # Required with sequence parallelism
    +...
    +

    This will train the Llama 3 8B model with 8K context length, with each sequence split +into 2 subsequences of length 4096 across 2 GPUs.

    +
    +
    +

    Sample Packing with Sequence Parallelism

    +

    Sequence parallelism is compatible with Axolotl’s sample packing functionality. When using both features together:

    +
      +
    1. Samples are first packed together
    2. +
    3. The packed sequences are then divided across GPUs in the sequence parallel group
    4. +
    5. Position IDs are automatically adjusted to maintain proper relative positions
    6. +
    +
    +
    +

    Effect on Batch Size

    +

    When using sequence parallelism, your effective global batch size is divided by the sequence_parallel_degree. This happens because:

    +
      +
    • Each group of sequence_parallel_degree GPUs works on the same batch (just different parts of each sequence)
    • +
    • The number of batches processed per step decreases
    • +
    +

    For example: +- With 8 GPUs and no sequence parallelism: 8 different batches processed per step +- With 8 GPUs and sequence_parallel_degree=4: Only 2 different batches processed per step (each split across 4 GPUs) +- If your per-GPU micro_batch_size is 2, the global batch size decreases from 16 to 4

    + + +
    +
    + +
    + +
    + + + + + \ No newline at end of file diff --git a/docs/torchao.html b/docs/torchao.html index b1352e71e..f66c89b5c 100644 --- a/docs/torchao.html +++ b/docs/torchao.html @@ -178,7 +178,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin + diff --git a/docs/unsloth.html b/docs/unsloth.html index 7def1fb08..ad87fd67d 100644 --- a/docs/unsloth.html +++ b/docs/unsloth.html @@ -178,7 +178,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin + @@ -463,7 +469,8 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin

    Overview

    -

    Unsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over standard industry baselines.

    +

    Unsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over +standard industry baselines.

    @@ -487,7 +494,8 @@ Important

    Usage

    Axolotl exposes a few configuration options to try out unsloth and get most of the performance gains.

    -

    Our unsloth integration is currently limited to the following model architectures: - llama

    +

    Our unsloth integration is currently limited to the following model architectures: +- llama

    These options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning

    unsloth_lora_mlp: true
     unsloth_lora_qkv: true
    diff --git a/examples/colab-notebooks/colab-axolotl-example.html b/examples/colab-notebooks/colab-axolotl-example.html
    index 09917ebd6..6bb7bec71 100644
    --- a/examples/colab-notebooks/colab-axolotl-example.html
    +++ b/examples/colab-notebooks/colab-axolotl-example.html
    @@ -206,7 +206,7 @@ window.Quarto = {
               
               
    + + @@ -660,7 +666,8 @@ window.Quarto = {

    Configuration Normalization

    -

    Axolotl uses a custom Dict class, called DictDefault to store configurations specified in the yaml configuration file (into a Python variable named cfg). The definition for this custom Dict can be found in the utils/dict.py

    +

    Axolotl uses a custom Dict class, called DictDefault +to store configurations specified in the yaml configuration file (into a Python variable named cfg). The definition for this custom Dict can be found in the utils/dict.py

    DictDefault is amended such that calling a missing key from it will result in a None return type. This is important because if some configuration options aren’t specified by the user, the None type allows Axolotl to perform boolean operations to determine the default settings for missing configurations. For more examples on how this is done, check out utils/config/init.py

    @@ -669,7 +676,8 @@ window.Quarto = {

    train() takes care of loading the appropriate tokenizer and pre-trained model through load_model() and load_tokenizer() from src/axolotl/utils/models.py respectively.

    load_tokenizer() loads in the appropriate tokenizer given the desired model, as well as chat templates.

    ModelLoader class follows after tokenizer has been selected. It will automatically discern the base model type, load in the desired model, as well as applying model-appropriate attention mechanism modifications (e.g. flash attention). Depending on which base model the user chooses in the configuration, ModelLoader will utilize the corresponding “attention hijacking” script. For example, if the user specified the base model to be NousResearch/Meta-Llama-3.1-8B, which is of llama type, and set flash_attn to True, ModelLoader will load in llama_attn_hijack_flash.py. For a list of supported attention hijacking, please refer to the directory /src/axolotl/monkeypatch/

    -

    Another important operation encompassed in train() is setting up the training that takes into account of user-specified traning configurations (e.g. num_epochs, optimizer) through the use of setup_trainer() from /src/axolotl/utils/trainer.py, which in turn relies on modules from /src/axolotl/core/trainer_builder.py. trainer_builder.py provides a list of trainer object options bespoke for the task type (Causal or Reinforcement learning (‘dpo’, ‘ipo’, ‘kto’) )

    +

    Another important operation encompassed in train() is setting up the training that takes into account of user-specified traning configurations (e.g. num_epochs, optimizer) through the use of setup_trainer() from /src/axolotl/utils/trainer.py, which in turn relies on modules from /src/axolotl/core/trainer_builder.py. +trainer_builder.py provides a list of trainer object options bespoke for the task type (Causal or Reinforcement learning (‘dpo’, ‘ipo’, ‘kto’) )

    Monkey patch

    diff --git a/index.html b/index.html index 92c9da742..186e78e5c 100644 --- a/index.html +++ b/index.html @@ -177,7 +177,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
    + + @@ -448,13 +454,35 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin

    - Axolotl + + + +Axolotl +

    -GitHub License tests Releases
    contributors GitHub Repo stars
    discord twitter
    tests-nightly multigpu-semi-weekly tests +GitHub License +tests +Releases +
    +contributors +GitHub Repo stars +
    +discord +twitter +
    +tests-nightly +multigpu-semi-weekly tests

    -

    Axolotl is a tool designed to streamline post-training for various AI models. Post-training refers to any modifications or additional training performed on pre-trained models - including full model fine-tuning, parameter-efficient tuning (like LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment techniques. With support for multiple model architectures and training configurations, Axolotl makes it easy to get started with these techniques.

    -

    Axolotl is designed to work with YAML config files that contain everything you need to preprocess a dataset, train or fine-tune a model, run model inference or evaluation, and much more.

    +

    Axolotl is a tool designed to streamline post-training for various AI models. +Post-training refers to any modifications or additional training performed on +pre-trained models - including full model fine-tuning, parameter-efficient tuning (like +LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment +techniques. With support for multiple model architectures and training configurations, +Axolotl makes it easy to get started with these techniques.

    +

    Axolotl is designed to work with YAML config files that contain everything you need to +preprocess a dataset, train or fine-tune a model, run model inference or evaluation, +and much more.

    Features:

    @@ -722,13 +751,17 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin -

    ✅: supported ❌: not supported ❓: untested

    +

    ✅: supported +❌: not supported +❓: untested

    ❤️ Sponsors

    Thank you to our sponsors who help make Axolotl possible:

      -
    • Modal - Modal lets you run jobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale, fine-tune large language models, run protein folding simulations, and much more.
    • +
    • Modal - Modal lets you run +jobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale, +fine-tune large language models, run protein folding simulations, and much more.

    Interested in sponsoring? Contact us at wing@axolotl.ai

    diff --git a/search.json b/search.json index 54d84d994..033f9f950 100644 --- a/search.json +++ b/search.json @@ -32,14 +32,14 @@ "href": "examples/colab-notebooks/colab-axolotl-example.html#configuration-normalization", "title": "Setting up", "section": "Configuration Normalization", - "text": "Configuration Normalization\nAxolotl uses a custom Dict class, called DictDefault to store configurations specified in the yaml configuration file (into a Python variable named cfg). The definition for this custom Dict can be found in the utils/dict.py\nDictDefault is amended such that calling a missing key from it will result in a None return type. This is important because if some configuration options aren’t specified by the user, the None type allows Axolotl to perform boolean operations to determine the default settings for missing configurations. For more examples on how this is done, check out utils/config/init.py" + "text": "Configuration Normalization\nAxolotl uses a custom Dict class, called DictDefault\nto store configurations specified in the yaml configuration file (into a Python variable named cfg). The definition for this custom Dict can be found in the utils/dict.py\nDictDefault is amended such that calling a missing key from it will result in a None return type. This is important because if some configuration options aren’t specified by the user, the None type allows Axolotl to perform boolean operations to determine the default settings for missing configurations. 
For more examples on how this is done, check out utils/config/init.py" }, { "objectID": "examples/colab-notebooks/colab-axolotl-example.html#loading-models-tokenizers-and-trainer", "href": "examples/colab-notebooks/colab-axolotl-example.html#loading-models-tokenizers-and-trainer", "title": "Setting up", "section": "Loading Models, Tokenizers, and Trainer", - "text": "Loading Models, Tokenizers, and Trainer\nIf we inspect cli.train.py, we will find that most of the heavy lifting were done by the function train() which is itself imported from src/axolotl/train.py.\ntrain() takes care of loading the appropriate tokenizer and pre-trained model through load_model() and load_tokenizer() from src/axolotl/utils/models.py respectively.\nload_tokenizer() loads in the appropriate tokenizer given the desired model, as well as chat templates.\nModelLoader class follows after tokenizer has been selected. It will automatically discern the base model type, load in the desired model, as well as applying model-appropriate attention mechanism modifications (e.g. flash attention). Depending on which base model the user chooses in the configuration, ModelLoader will utilize the corresponding “attention hijacking” script. For example, if the user specified the base model to be NousResearch/Meta-Llama-3.1-8B, which is of llama type, and set flash_attn to True, ModelLoader will load in llama_attn_hijack_flash.py. For a list of supported attention hijacking, please refer to the directory /src/axolotl/monkeypatch/\nAnother important operation encompassed in train() is setting up the training that takes into account of user-specified traning configurations (e.g. num_epochs, optimizer) through the use of setup_trainer() from /src/axolotl/utils/trainer.py, which in turn relies on modules from /src/axolotl/core/trainer_builder.py. 
trainer_builder.py provides a list of trainer object options bespoke for the task type (Causal or Reinforcement learning (‘dpo’, ‘ipo’, ‘kto’) )" + "text": "Loading Models, Tokenizers, and Trainer\nIf we inspect cli.train.py, we will find that most of the heavy lifting were done by the function train() which is itself imported from src/axolotl/train.py.\ntrain() takes care of loading the appropriate tokenizer and pre-trained model through load_model() and load_tokenizer() from src/axolotl/utils/models.py respectively.\nload_tokenizer() loads in the appropriate tokenizer given the desired model, as well as chat templates.\nModelLoader class follows after tokenizer has been selected. It will automatically discern the base model type, load in the desired model, as well as applying model-appropriate attention mechanism modifications (e.g. flash attention). Depending on which base model the user chooses in the configuration, ModelLoader will utilize the corresponding “attention hijacking” script. For example, if the user specified the base model to be NousResearch/Meta-Llama-3.1-8B, which is of llama type, and set flash_attn to True, ModelLoader will load in llama_attn_hijack_flash.py. For a list of supported attention hijacking, please refer to the directory /src/axolotl/monkeypatch/\nAnother important operation encompassed in train() is setting up the training that takes into account of user-specified traning configurations (e.g. 
num_epochs, optimizer) through the use of setup_trainer() from /src/axolotl/utils/trainer.py, which in turn relies on modules from /src/axolotl/core/trainer_builder.py.\ntrainer_builder.py provides a list of trainer object options bespoke for the task type (Causal or Reinforcement learning (‘dpo’, ‘ipo’, ‘kto’) )" }, { "objectID": "examples/colab-notebooks/colab-axolotl-example.html#monkey-patch", @@ -53,7 +53,7 @@ "href": "docs/dataset-formats/stepwise_supervised.html", "title": "Stepwise Supervised Format", "section": "", - "text": "The stepwise supervised format is designed for chain-of-thought (COT) reasoning datasets where each example contains multiple completion steps and a preference label for each step.\n\n\nHere’s a simple example of a stepwise supervised dataset entry:\n{\n \"prompt\": \"Which number is larger, 9.8 or 9.11?\",\n \"completions\": [\n \"The fractional part of 9.8 is 0.8, while the fractional part of 9.11 is 0.11.\",\n \"Since 0.11 is greater than 0.8, the number 9.11 is larger than 9.8.\"\n ],\n \"labels\": [true, false]\n}", + "text": "The stepwise supervised format is designed for chain-of-thought (COT) reasoning\ndatasets where each example contains multiple completion steps and a preference label\nfor each step.\n\n\nHere’s a simple example of a stepwise supervised dataset entry:\n{\n \"prompt\": \"Which number is larger, 9.8 or 9.11?\",\n \"completions\": [\n \"The fractional part of 9.8 is 0.8, while the fractional part of 9.11 is 0.11.\",\n \"Since 0.11 is greater than 0.8, the number 9.11 is larger than 9.8.\"\n ],\n \"labels\": [true, false]\n}", "crumbs": [ "Dataset Formats", "Stepwise Supervised Format" @@ -64,7 +64,7 @@ "href": "docs/dataset-formats/stepwise_supervised.html#stepwise-supervised", "title": "Stepwise Supervised Format", "section": "", - "text": "The stepwise supervised format is designed for chain-of-thought (COT) reasoning datasets where each example contains multiple completion steps and a preference label for 
each step.\n\n\nHere’s a simple example of a stepwise supervised dataset entry:\n{\n \"prompt\": \"Which number is larger, 9.8 or 9.11?\",\n \"completions\": [\n \"The fractional part of 9.8 is 0.8, while the fractional part of 9.11 is 0.11.\",\n \"Since 0.11 is greater than 0.8, the number 9.11 is larger than 9.8.\"\n ],\n \"labels\": [true, false]\n}", + "text": "The stepwise supervised format is designed for chain-of-thought (COT) reasoning\ndatasets where each example contains multiple completion steps and a preference label\nfor each step.\n\n\nHere’s a simple example of a stepwise supervised dataset entry:\n{\n \"prompt\": \"Which number is larger, 9.8 or 9.11?\",\n \"completions\": [\n \"The fractional part of 9.8 is 0.8, while the fractional part of 9.11 is 0.11.\",\n \"Since 0.11 is greater than 0.8, the number 9.11 is larger than 9.8.\"\n ],\n \"labels\": [true, false]\n}", "crumbs": [ "Dataset Formats", "Stepwise Supervised Format" @@ -75,7 +75,7 @@ "href": "docs/dataset-formats/template_free.html", "title": "Template-Free", "section": "", - "text": "One of the most popular features of axolotl is setting the following configuration value:\ntrain_on_inputs: false\nIf you declare a dataset formats such as alpaca or chatml, axolotl knows what is an input (i.e. human) vs. an output (i.e. the assistant) and masks the input labels so that your model can focus on predicting the outputs only.\n\n\n\nHowever, there are many situations where you don’t want to use one of these formats or templates. This is because they can:\n\nAdd unnecessary boilerplate to your prompts.\nCreate artifacts like special delimiters <|im_start|> that can quickly become footguns if you don’t include them correctly at inference time.\nEnforce a chat interface when you do not want one. 
Sometimes you just want to fine-tune a model to a very specific task and do NOT want multi-turn conversations, roles, etc.\nLimit you to only certain roles that the template allows.\n\n\n\n\nYou can construct your prompts without a template by using the input_output format, by setting type: input_output in your configuration file like this:\nconfig.yml\ntrain_on_inputs: false # Mask segments of your data\ndatasets:\n - path: output.jsonl\n type: input_output # use template free prompt construction\nUnlike type: completion, which is also template-free, type: input_output allows you to mask segments of your text. More details on how this works are described below.", + "text": "One of the most popular features of\naxolotl is\nsetting the following configuration value:\ntrain_on_inputs: false\nIf you declare a dataset formats\nsuch as alpaca or chatml, axolotl knows what is an input\n(i.e. human) vs. an output (i.e. the assistant) and masks the input\nlabels so that your model can focus on predicting the outputs only.\n\n\n\nHowever, there are many situations where you don’t want to use one of\nthese formats or templates. This is because they can:\n\nAdd unnecessary boilerplate to your prompts.\nCreate artifacts like special delimiters <|im_start|> that can\nquickly become footguns if you don’t include them correctly at\ninference time.\nEnforce a chat interface when you do not want one. 
Sometimes you\njust want to fine-tune a model to a very specific task and do NOT\nwant multi-turn conversations, roles, etc.\nLimit you to only certain roles that the template allows.\n\n\n\n\nYou can construct your prompts without a template by using the\ninput_output format, by setting type: input_output in your\nconfiguration file like this:\nconfig.yml\ntrain_on_inputs: false # Mask segments of your data\ndatasets:\n - path: output.jsonl\n type: input_output # use template free prompt construction\nUnlike type: completion, which is also template-free,\ntype: input_output allows you to mask segments of your text. More\ndetails on how this works are described below.", "crumbs": [ "Dataset Formats", "Template-Free" @@ -86,7 +86,7 @@ "href": "docs/dataset-formats/template_free.html#sec-background", "title": "Template-Free", "section": "", - "text": "One of the most popular features of axolotl is setting the following configuration value:\ntrain_on_inputs: false\nIf you declare a dataset formats such as alpaca or chatml, axolotl knows what is an input (i.e. human) vs. an output (i.e. the assistant) and masks the input labels so that your model can focus on predicting the outputs only.\n\n\n\nHowever, there are many situations where you don’t want to use one of these formats or templates. This is because they can:\n\nAdd unnecessary boilerplate to your prompts.\nCreate artifacts like special delimiters <|im_start|> that can quickly become footguns if you don’t include them correctly at inference time.\nEnforce a chat interface when you do not want one. 
Sometimes you just want to fine-tune a model to a very specific task and do NOT want multi-turn conversations, roles, etc.\nLimit you to only certain roles that the template allows.\n\n\n\n\nYou can construct your prompts without a template by using the input_output format, by setting type: input_output in your configuration file like this:\nconfig.yml\ntrain_on_inputs: false # Mask segments of your data\ndatasets:\n - path: output.jsonl\n type: input_output # use template free prompt construction\nUnlike type: completion, which is also template-free, type: input_output allows you to mask segments of your text. More details on how this works are described below.", + "text": "One of the most popular features of\naxolotl is\nsetting the following configuration value:\ntrain_on_inputs: false\nIf you declare a dataset formats\nsuch as alpaca or chatml, axolotl knows what is an input\n(i.e. human) vs. an output (i.e. the assistant) and masks the input\nlabels so that your model can focus on predicting the outputs only.\n\n\n\nHowever, there are many situations where you don’t want to use one of\nthese formats or templates. This is because they can:\n\nAdd unnecessary boilerplate to your prompts.\nCreate artifacts like special delimiters <|im_start|> that can\nquickly become footguns if you don’t include them correctly at\ninference time.\nEnforce a chat interface when you do not want one. 
Sometimes you\njust want to fine-tune a model to a very specific task and do NOT\nwant multi-turn conversations, roles, etc.\nLimit you to only certain roles that the template allows.\n\n\n\n\nYou can construct your prompts without a template by using the\ninput_output format, by setting type: input_output in your\nconfiguration file like this:\nconfig.yml\ntrain_on_inputs: false # Mask segments of your data\ndatasets:\n - path: output.jsonl\n type: input_output # use template free prompt construction\nUnlike type: completion, which is also template-free,\ntype: input_output allows you to mask segments of your text. More\ndetails on how this works are described below.", "crumbs": [ "Dataset Formats", "Template-Free" @@ -97,7 +97,7 @@ "href": "docs/dataset-formats/template_free.html#sec-usage", "title": "Template-Free", "section": "Usage", - "text": "Usage\nThis is how you can use the input_output format:\n\n1. Prepare Data\nTo use the input_output format, collect your data in the following format into a jsonl file (below is the first row from the file output.jsonl` pretty printed):\n$ head -n1 output.jsonl | python -m json.tool\n\n{\n \"segments\": [\n {\n \"label\": true,\n \"text\": \"<s>Hello\\n\"\n },\n {\n \"label\": true,\n \"text\": \"hi there!. \"\n },\n {\n \"label\": false,\n \"text\": \"goodbye \"\n },\n {\n \"label\": true,\n \"text\": \"farewell</s>\"\n }\n ]\n}\n\nSet label:false when you want to mask a segment of text so that the model isn’t trained on it. Some things to keep in mind:\n\n[!IMPORTANT] 1. EOS, BOS, spaces, newlines etc. are entirely up to you. Axolotl concatenates all the segments as-is. The tokenizer doesn’t add anything additional. Notice how I added spaces, newlines, <s> (BOS), and </s> (EOS) myself. 2. Make sure you check the materialized output to validate that the prompt is getting assembled how you like.\n\n\n\n2. 
Use type: input_output\nLet’s materialize data with our output.jsonl file by setting type: input_output in our axolotl config:\n# training_config.yaml\nbase_model: mistralai/Mistral-7B-v0.1\ndata_seed: 49\nseed: 49\n\ndatasets:\n - path: output.jsonl\n type: input_output\nval_set_size: 0.1\n\nsequence_len: 896\nsample_packing: false\n\nmicro_batch_size: 2\ngradient_accumulation_steps: 3\neval_batch_size: 2\nnum_epochs: 1\nlearning_rate: 0.0002\n\ntrain_on_inputs: false\nspecial_tokens:\n bos_token: \"<s>\"\n eos_token: \"</s>\"\n unk_token: \"<unk>\"\nYou can use the following command to materialize your data. The --debug flag will print the tokens, along with the labels so you can verify that the correct items are being ignored:\naxolotl preprocess training_config.yaml --debug\n\n...\n[2024-03-05 23:36:46,969] [INFO] [axolotl.check_example_labels:35] [PID:607731] [RANK:0] <s>(1, 1) Hello(22557, 22557)\n(13, 13) hi(12014, 12014) there(736, 736) !(28808, 28808) .(28723, 28723) (28705, 28705) good(-100, 1179) bye(-100, 17664) (-100, 28705) fare(19111, 19111) well(5458, 5458) </s>(2, 2)\nThe format is decoded_token(label, token_id), for example, <s>(1, 1) means that the token is <s>, the label is 1 and the token_id is 1. When the label is -100 then that token is ignored for training.\n\n\n3. Check the prompts\nHere is another way to check the materialized output:\nfrom transformers import AutoTokenizer\nfrom datasets import load_from_disk\nimport yaml\n\ndirectory = !ls last_run_prepared/\nwith open('training_config.yaml', 'r') as f:\n cfg = yaml.safe_load(f)\nmodel_id = cfg['base_model']\ntok = AutoTokenizer.from_pretrained(model_id)\nds = load_from_disk(f'last_run_prepared/{directory[0]}/')\n>>> row = ds[0]\n>>> print(tok.decode(row['input_ids']))\n<s> Hello\n hi there!. 
goodbye farewell</s>\nWe can check that the right tokens are ignored by comparing the labels to each token:\nimport pandas as pd\npd.DataFrame([{'token': tok.decode(i), 'label': l, 'id':i} for i,l in\n zip(row['input_ids'], row['labels'])])\n\n\n\ntoken\nlabel\nid\n\n\n\n\n0\n<s>\n1\n\n\n1\nHello\n22557\n\n\n2\n\\n\n13\n\n\n3\nhi\n12014\n\n\n4\nthere\n736\n\n\n5\n!\n28808\n\n\n6\n.\n28723\n\n\n7\n\n28705\n\n\n8\ngood\n-100\n\n\n9\nbye\n-100\n\n\n10\n\n-100\n\n\n11\nfare\n19111\n\n\n12\nwell\n5458\n\n\n13\n</s>\n2\n\n\n\nIf we look at the input data, the above table seems correct! (The jsonl version is repeated below for reference):\n$ head -n1 output.jsonl | python -m json.tool\n\n{\n \"segments\": [\n {\n \"label\": true,\n \"text\": \"<s>Hello\\n\"\n },\n {\n \"label\": true,\n \"text\": \"hi there!. \"\n },\n {\n \"label\": false,\n \"text\": \"goodbye \"\n },\n {\n \"label\": true,\n \"text\": \"farewell</s>\"\n }\n ]\n}", + "text": "Usage\nThis is how you can use the input_output format:\n\n1. Prepare Data\nTo use the input_output format, collect your data in the following\nformat into a jsonl file (below is the first row from the file\noutput.jsonl` pretty printed):\n$ head -n1 output.jsonl | python -m json.tool\n\n{\n \"segments\": [\n {\n \"label\": true,\n \"text\": \"<s>Hello\\n\"\n },\n {\n \"label\": true,\n \"text\": \"hi there!. \"\n },\n {\n \"label\": false,\n \"text\": \"goodbye \"\n },\n {\n \"label\": true,\n \"text\": \"farewell</s>\"\n }\n ]\n}\n\nSet label:false when you want to mask a segment of text so that the\nmodel isn’t trained on it. Some things to keep in mind:\n\n[!IMPORTANT]\n1. EOS, BOS, spaces, newlines etc. are entirely up to you. Axolotl\nconcatenates all the segments as-is. The tokenizer doesn’t add\nanything additional. Notice how I added spaces, newlines, <s>\n(BOS), and </s> (EOS) myself.\n2. Make sure you check the materialized output to validate that the\nprompt is getting assembled how you like.\n\n\n\n2. 
Use type: input_output\nLet’s materialize data with our output.jsonl file by setting\ntype: input_output in our axolotl config:\n# training_config.yaml\nbase_model: mistralai/Mistral-7B-v0.1\ndata_seed: 49\nseed: 49\n\ndatasets:\n - path: output.jsonl\n type: input_output\nval_set_size: 0.1\n\nsequence_len: 896\nsample_packing: false\n\nmicro_batch_size: 2\ngradient_accumulation_steps: 3\neval_batch_size: 2\nnum_epochs: 1\nlearning_rate: 0.0002\n\ntrain_on_inputs: false\nspecial_tokens:\n bos_token: \"<s>\"\n eos_token: \"</s>\"\n unk_token: \"<unk>\"\nYou can use the following command to materialize your data. The\n--debug flag will print the tokens, along with the labels so you can\nverify that the correct items are being ignored:\naxolotl preprocess training_config.yaml --debug\n\n...\n[2024-03-05 23:36:46,969] [INFO] [axolotl.check_example_labels:35] [PID:607731] [RANK:0] <s>(1, 1) Hello(22557, 22557)\n(13, 13) hi(12014, 12014) there(736, 736) !(28808, 28808) .(28723, 28723) (28705, 28705) good(-100, 1179) bye(-100, 17664) (-100, 28705) fare(19111, 19111) well(5458, 5458) </s>(2, 2)\nThe format is decoded_token(label, token_id), for example,\n<s>(1, 1) means that the token is <s>, the label is 1 and the\ntoken_id is 1. When the label is -100 then that token is ignored for\ntraining.\n\n\n3. Check the prompts\nHere is another way to check the materialized output:\nfrom transformers import AutoTokenizer\nfrom datasets import load_from_disk\nimport yaml\n\ndirectory = !ls last_run_prepared/\nwith open('training_config.yaml', 'r') as f:\n cfg = yaml.safe_load(f)\nmodel_id = cfg['base_model']\ntok = AutoTokenizer.from_pretrained(model_id)\nds = load_from_disk(f'last_run_prepared/{directory[0]}/')\n>>> row = ds[0]\n>>> print(tok.decode(row['input_ids']))\n<s> Hello\n hi there!. 
goodbye farewell</s>\nWe can check that the right tokens are ignored by comparing the labels\nto each token:\nimport pandas as pd\npd.DataFrame([{'token': tok.decode(i), 'label': l, 'id':i} for i,l in\n zip(row['input_ids'], row['labels'])])\n\n\n\ntoken\nlabel\nid\n\n\n\n\n0\n<s>\n1\n\n\n1\nHello\n22557\n\n\n2\n\\n\n13\n\n\n3\nhi\n12014\n\n\n4\nthere\n736\n\n\n5\n!\n28808\n\n\n6\n.\n28723\n\n\n7\n\n28705\n\n\n8\ngood\n-100\n\n\n9\nbye\n-100\n\n\n10\n\n-100\n\n\n11\nfare\n19111\n\n\n12\nwell\n5458\n\n\n13\n</s>\n2\n\n\n\nIf we look at the input data, the above table seems correct! (The jsonl\nversion is repeated below for reference):\n$ head -n1 output.jsonl | python -m json.tool\n\n{\n \"segments\": [\n {\n \"label\": true,\n \"text\": \"<s>Hello\\n\"\n },\n {\n \"label\": true,\n \"text\": \"hi there!. \"\n },\n {\n \"label\": false,\n \"text\": \"goodbye \"\n },\n {\n \"label\": true,\n \"text\": \"farewell</s>\"\n }\n ]\n}", "crumbs": [ "Dataset Formats", "Template-Free" @@ -141,7 +141,7 @@ "href": "docs/amd_hpc.html#setup", "title": "AMD GPUs on HPC Systems", "section": "Setup", - "text": "Setup\n\n1. Install Python\nWe recommend using Miniforge, a minimal conda-based Python distribution:\ncurl -L -O \"https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh\"\nbash Miniforge3-$(uname)-$(uname -m).sh\n\n\n2. Configure Python Environment\nAdd Python to your PATH and ensure it’s available at login:\necho 'export PATH=~/miniforge3/bin:$PATH' >> ~/.bashrc\necho 'if [ -f ~/.bashrc ]; then . ~/.bashrc; fi' >> ~/.bash_profile\n\n\n3. Load AMD GPU Software\nLoad the ROCm module:\nmodule load rocm/5.7.1\nNote: The specific module name and version may vary depending on your HPC system. Consult your system documentation for the correct module name.\n\n\n4. 
Install PyTorch\nInstall PyTorch with ROCm support:\npip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7 --force-reinstall\n\n\n5. Install Flash Attention\nClone and install the Flash Attention repository:\ngit clone --recursive https://github.com/ROCmSoftwarePlatform/flash-attention.git\nexport GPU_ARCHS=\"gfx90a\"\ncd flash-attention\nexport PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')\npatch \"${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py\" hipify_patch.patch\npip install --no-build-isolation .\n\n\n6. Install Axolotl\nClone and install Axolotl:\ngit clone https://github.com/axolotl-ai-cloud/axolotl\ncd axolotl\npip install packaging ninja\npip install --no-build-isolation -e .\n\n\n7. Apply xformers Workaround\nxformers appears to be incompatible with ROCm. Apply the following workarounds: - Edit $HOME/packages/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py modifying the code to always return False for SwiGLU availability from xformers. - Edit $HOME/miniforge3/lib/python3.10/site-packages/xformers/ops/swiglu_op.py replacing the “SwiGLU” function with a pass statement.\n\n\n8. Prepare Job Submission Script\nCreate a script for job submission using your HPC’s particular software (e.g. Slurm, PBS). Include necessary environment setup and the command to run Axolotl training. If the compute node(s) do(es) not have internet access, it is recommended to include\nexport TRANSFORMERS_OFFLINE=1\nexport HF_DATASETS_OFFLINE=1\n\n\n9. Download Base Model\nDownload a base model using the Hugging Face CLI:\nhuggingface-cli download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B\n\n\n10. Create Axolotl Configuration\nCreate an Axolotl configuration file (YAML format) tailored to your specific training requirements and dataset. Use FSDP for multi-node training.\nNote: Deepspeed did not work at the time of testing. 
However, if anyone managed to get it working, please let us know.\n\n\n11. Preprocess Data\nRun preprocessing on the login node:\nCUDA_VISIBLE_DEVICES=\"\" python -m axolotl.cli.preprocess /path/to/your/config.yaml\n\n\n12. Train\nYou are now ready to submit your previously prepared job script. 🚂", + "text": "Setup\n\n1. Install Python\nWe recommend using Miniforge, a minimal conda-based Python distribution:\ncurl -L -O \"https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh\"\nbash Miniforge3-$(uname)-$(uname -m).sh\n\n\n2. Configure Python Environment\nAdd Python to your PATH and ensure it’s available at login:\necho 'export PATH=~/miniforge3/bin:$PATH' >> ~/.bashrc\necho 'if [ -f ~/.bashrc ]; then . ~/.bashrc; fi' >> ~/.bash_profile\n\n\n3. Load AMD GPU Software\nLoad the ROCm module:\nmodule load rocm/5.7.1\nNote: The specific module name and version may vary depending on your HPC system. Consult your system documentation for the correct module name.\n\n\n4. Install PyTorch\nInstall PyTorch with ROCm support:\npip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7 --force-reinstall\n\n\n5. Install Flash Attention\nClone and install the Flash Attention repository:\ngit clone --recursive https://github.com/ROCmSoftwarePlatform/flash-attention.git\nexport GPU_ARCHS=\"gfx90a\"\ncd flash-attention\nexport PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')\npatch \"${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py\" hipify_patch.patch\npip install --no-build-isolation .\n\n\n6. Install Axolotl\nClone and install Axolotl:\ngit clone https://github.com/axolotl-ai-cloud/axolotl\ncd axolotl\npip install packaging ninja\npip install --no-build-isolation -e .\n\n\n7. Apply xformers Workaround\nxformers appears to be incompatible with ROCm. 
Apply the following workarounds:\n- Edit $HOME/packages/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py modifying the code to always return False for SwiGLU availability from xformers.\n- Edit $HOME/miniforge3/lib/python3.10/site-packages/xformers/ops/swiglu_op.py replacing the “SwiGLU” function with a pass statement.\n\n\n8. Prepare Job Submission Script\nCreate a script for job submission using your HPC’s particular software (e.g. Slurm, PBS). Include necessary environment setup and the command to run Axolotl training. If the compute node(s) do(es) not have internet access, it is recommended to include\nexport TRANSFORMERS_OFFLINE=1\nexport HF_DATASETS_OFFLINE=1\n\n\n9. Download Base Model\nDownload a base model using the Hugging Face CLI:\nhuggingface-cli download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B\n\n\n10. Create Axolotl Configuration\nCreate an Axolotl configuration file (YAML format) tailored to your specific training requirements and dataset. Use FSDP for multi-node training.\nNote: Deepspeed did not work at the time of testing. However, if anyone managed to get it working, please let us know.\n\n\n11. Preprocess Data\nRun preprocessing on the login node:\nCUDA_VISIBLE_DEVICES=\"\" python -m axolotl.cli.preprocess /path/to/your/config.yaml\n\n\n12. Train\nYou are now ready to submit your previously prepared job script. 
🚂", "crumbs": [ "Deployments", "AMD GPUs on HPC Systems" @@ -152,7 +152,7 @@ "href": "docs/config.html", "title": "Config Reference", "section": "", - "text": "# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files\n# This can also be a relative path to a model on disk\nbase_model: ./llama-7b-hf\n# You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)\nbase_model_ignore_patterns:\n# If the base_model repo on hf hub doesn't include configuration .json files,\n# You can set that here, or leave this empty to default to base_model\nbase_model_config: ./llama-7b-hf\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model:\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one defined in the base model\ntokenizer_config:\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too\nmodel_type: AutoModelForCausalLM\n# Corresponding tokenizer for the model AutoTokenizer is a good choice\ntokenizer_type: AutoTokenizer\n# Trust remote code for untrusted source\ntrust_remote_code:\n# use_fast option for tokenizer loading from_pretrained, default to True\ntokenizer_use_fast:\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy:\n# Resize the model embeddings when new tokens are added to multiples of 32\n# This is reported to improve training speed on some models\nresize_token_embeddings_to_32x:\n# Optional[bool] Whether to shrink the embeddings to len(tokenizer). 
By default, we won't shrink.\nshrink_embeddings:\n\n# (Internal use only)\n# Used to identify which the model is based on\nis_falcon_derived_model:\nis_llama_derived_model:\nis_qwen_derived_model:\n# Please note that if you set this to true, `padding_side` will be set to \"left\" by default\nis_mistral_derived_model:\n\n# optional overrides to the base model configuration\noverrides_of_model_config:\n # RoPE Scaling https://github.com/huggingface/transformers/pull/24653\n rope_scaling:\n type: # linear | dynamic\n factor: # float\n\n# optional overrides the base model loading from_pretrained\noverrides_of_model_kwargs:\n # use_cache: False\n\n# optional overrides to the bnb 4bit quantization configuration\n# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig\nbnb_config_kwargs:\n # These are default values\n llm_int8_has_fp16_weight: false\n bnb_4bit_quant_type: nf4\n bnb_4bit_use_double_quant: true\n\n\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: true\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: true\n# Use bitsandbytes 4 bit\nload_in_4bit:\n\n# Use CUDA bf16\nbf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere\n# Use CUDA fp16\nfp16: true\n# Use CUDA tf32\ntf32: true # require >=ampere\n\n# No AMP (automatic mixed precision)\nbfloat16: true # require >=ampere\nfloat16: true\n\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset\ngpu_memory_limit: 20GiB\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: true\n\n# List[str]. 
Add plugins to extend the pipeline.\n# See `src/axolotl/integrations` for the available plugins or doc below for more details.\n# https://axolotl-ai-cloud.github.io/axolotl/docs/custom_integrations.html\nplugins:\n # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n\n# A list of one or more datasets to finetune the model with\ndatasets:\n # HuggingFace dataset repo | s3://,gs:// path | \"json\" for local dataset, make sure to fill data_files\n - path: vicgalle/alpaca-gpt4\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>\n ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file\n data_files: # Optional[str] path to source data files\n\n shards: # Optional[int] split dataset into N pieces (use with shards_idx)\n shards_idx: # Optional[int] = 0 the index of sharded dataset to use\n\n preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)\n\n name: # Optional[str] name of dataset configuration to load\n train_on_split: train # Optional[str] name of dataset split to load from\n revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.\n trust_remote_code: # Optional[bool] Trust remote code for untrusted source\n\n # Custom user instruction prompt\n - path: repo\n type:\n # The below are defaults. 
only set what's needed if you use a different column name.\n system_prompt: \"\"\n system_format: \"{system}\"\n field_system: system\n field_instruction: instruction\n field_input: input\n field_output: output\n\n # Customizable to be single line or multi-line\n # Use {instruction}/{input} as key to be replaced\n # 'format' can include {input}\n format: |-\n User: {instruction} {input}\n Assistant:\n # 'no_input_format' cannot include {input}\n no_input_format: \"{instruction} \"\n\n # For `completion` datsets only, uses the provided field instead of `text` column\n field:\n\n # Using chat template\n - path: ...\n # Set type to `chat_template` to use this strategy\n type: chat_template\n # Specify the name of the chat template to use\n # The name of the chat template to use for training, following values are supported:\n # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default.\n # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py\n # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml.\n # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.\n chat_template: tokenizer_default\n\n # Custom jinja chat template. 
Used only if `chat_template: jinja` or empty.\n chat_template_jinja:\n\n # Key containing the messages (default: \"messages\")\n field_messages: messages\n\n # Mapping of properties from the input dataset to the chat template.\n # (default: message_property_mappings={'role':'role', 'content':'content'})\n # If a property exists in the template but not in this mapping, the system will attempt\n # to load it directly from the message using the property name as the key.\n # Example: In the mapping below, 'from' is loaded from input dataset and used as 'role',\n # while 'value' is loaded and used as 'content' in the chat template.\n message_property_mappings:\n role: from\n content: value\n # ...\n\n # Optional[Dict[str, List]]. Roles mapping in the messages. The default is:\n roles:\n user: [\"human\", \"user\"]\n assistant: [\"gpt\", \"assistant\"]\n system: [\"system\"]\n tool: [\"tool\"]\n\n # Optional[bool]. Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. If you wish to,\n # we recommend using a custom jinja template with the default system message removed or\n # adding a system turn with empty content.\n drop_system_message:\n\n # IMPORTANT: The following fields determine which parts of the conversation to train on.\n # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train\n # See examples at `docs/dataset-formats/conversation.qmd`\n # Note: If the below 4 fields are set to empty, defaults to training only on the last message.\n\n # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: [\"assistant\"] # default\n # Optional[str]. Which EOS tokens to train on in the conversation. 
Possible values are:\n # - all: train on all EOS tokens\n # - turn (default): train on the EOS token at the end of each trainable turn\n # - last: train on the last EOS token in the conversation\n # TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.\n train_on_eos: last\n # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.\n message_field_training: training\n # The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.\n # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).\n message_field_training_detail: train_detail\n\n\n# If false, the datasets will not be shuffled and will keep their original order in `datasets`.\n# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: true\n\nDeduplicates datasets and test_datasets with identical entries.\ndataset_exact_deduplication: true\n\n# A list of one or more datasets to eval the model with.\n# You can use either test_datasets, or val_set_size, but not both.\ntest_datasets:\n - path: /workspace/data/eval.jsonl\n ds_type: json\n # You need to specify a split. For \"json\" datasets the default split is called \"train\".\n split: train\n type: completion\n data_files:\n - /workspace/data/eval.jsonl\n\n# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'\nrl:\nrl_beta: # Optional[float]. The beta parameter for the RL training.\n\n# dpo\ndpo_use_weighting: # Optional[bool]. Whether to perform weighting.\nrpo_alpha: # Optional[float]. 
Weighting of NLL term in loss from RPO paper.\n\n# orpo\norpo_alpha: 0.1 # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping.\n\n# kto\nkto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss.\nkto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss.\n\n# simpo\ncpo_alpha: 1.0 # Weight of the BC regularizer\nsimpo_gamma: 0.5 # Target reward margin for the SimPO loss\n\n# grpo\ntrl:\n use_vllm: # Optional[bool]. Whether to use VLLM for RL training.\n vllm_device: # Optional[str]. Device to use for VLLM.\n vllm_gpu_memory_utilization: # Optional[float]. GPU memory utilization for VLLM.\n vllm_max_model_len: # Optional[int]. Maximum length of the model for VLLM.\n vllm_dtype: # Optional[str]. Data type for VLLM.\n\n beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use\n max_completion_length: # Optional[int]. Maximum length of the completion for RL training.\n\n reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir.\n reward_weights: # Optional[list[float]]. List of reward weights for the reward functions.\n\n num_generations: # Optional[int]. Number of generations to sample.\n log_completions: # Optional[bool]. Whether to log completions.\n\n sync_ref_model: # Optional[bool]. Whether to sync the reference model.\n ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.\n ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.\n\n\n# reward modelling: `True` or `False`\nreward_model:\n\n# process reward modelling: `True` or `False`\nprocess_reward_model:\n\n# The name of the chat template to use for training, following values are supported:\n# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. 
If the chat template is not available in the tokenizer, it will raise an error. This is the default value.\n# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py\n# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer.\n# - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.\n# The selected chat template will be saved to the tokenizer_config.json for easier inferencing\n# Note: It is recommended to set train_on_inputs to true when using a chat template that is different from the model's default chat template.\nchat_template: tokenizer_default\n# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.\nchat_template_jinja: null\n# Changes the default system message. Currently only supports chatml.\ndefault_system_message: You are a helpful assistant. Please give a long and detailed answer.\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: data/last_run_prepared\n# Push prepared dataset to hub\npush_dataset_to_hub: # Optional[str] repo_org/repo_name\n# The maximum number of processes to use while preprocessing your input dataset. 
This defaults to `os.cpu_count()`\n# if not set.\ndataset_processes: # defaults to os.cpu_count() if not set\n# Keep dataset in memory while preprocessing\n# Only needed if cached dataset is taking too much storage\ndataset_keep_in_memory:\n# push checkpoints to hub\nhub_model_id: # private repo path to push finetuned model\n# how to push checkpoints to hub\n# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy\nhub_strategy:\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets\n# Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: # boolean\n# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.\nval_set_size: 0.04\n# Num shards for whole dataset\ndataset_shard_num:\n# Index of shard to use for whole dataset\ndataset_shard_idx:\n\n# The maximum length of an input to train with, this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: 2048\n# Pad inputs so each step uses constant sized buffers\n# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently\npad_to_sequence_len:\n# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'\nsample_packing:\n# Set to 'false' if getting errors during eval with sample_packing on.\neval_sample_packing:\n# You can set these packing optimizations AFTER starting a training at least once.\n# The trainer will provide recommended values for these values.\nsample_packing_eff_est:\ntotal_num_tokens:\n# Increasing the following values helps with packing, but usually only slightly (<%1.)\n# The number of samples packed at a time.\nsample_packing_group_size: 100000\n# The number of samples which can be packed into one sequence. 
Increase if using a large sequence_len with many short samples.\nsample_packing_bin_size: 200\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation:\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening:\n\n# Passed through to transformers when loading the model when launched without accelerate\n# Use `sequential` when training w/ model parallelism to limit memory\ndevice_map:\n# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model.\nmax_memory:\n\n# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model\nadapter: lora\n# If you already have a lora model trained that you want to load, put that here.\n# This means after training, if you want to test the model, you should set this to the value of `output_dir`.\n# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir:\n\n# LoRA hyperparameters\n# For more details about the following options, see:\n# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2\nlora_r: 8\nlora_alpha: 16\nlora_dropout: 0.05\nlora_target_modules:\n - q_proj\n - v_proj\n# - k_proj\n# - o_proj\n# - gate_proj\n# - down_proj\n# - up_proj\nlora_target_linear: # If true, will target all linear modules\npeft_layers_to_transform: # The layer indices to transform, otherwise, apply to all layers\n\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. 
It may vary for other models.\n# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\n# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994\nlora_modules_to_save:\n# - embed_tokens\n# - lm_head\n\nlora_fan_in_fan_out: false\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for\n# speed and memory savings\n# See: https://axolotl-ai-cloud.github.io/axolotl/docs/lora_optims.html\nlora_mlp_kernel: true\nlora_qkv_kernel: true\nlora_o_kernel: true\n\n# LoRA+ hyperparameters\n# For more details about the following options, see:\n# https://arxiv.org/abs/2402.12354 and `src/axolotl/core/train_builder.py`\nloraplus_lr_ratio: # loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_embedding: # loraplus learning rate for lora embedding layers. Default value is 1e-6.\n\npeft:\n # Configuration options for loftq initialization for LoRA\n # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization\n loftq_config:\n loftq_bits: # typically 4 bits\n\n# ReLoRA configuration\n# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed\nrelora_steps: # Number of steps per ReLoRA restart\nrelora_warmup_steps: # Number of per-restart warmup steps\nrelora_anneal_steps: # Number of anneal steps for each relora cycle\nrelora_prune_ratio: # threshold for optimizer magnitude when pruning\nrelora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings\n\n# wandb configuration if you're using it\n# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.\nwandb_mode: # \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn off wandb\nwandb_project: # Your wandb project name\nwandb_entity: # A wandb Team name if using a Team\nwandb_watch:\nwandb_name: # Set the name of your wandb 
run\nwandb_run_id: # Set the ID of your wandb run\nwandb_log_model: # \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only at the end of training\n\n# mlflow configuration if you're using it\nmlflow_tracking_uri: # URI to mlflow\nmlflow_experiment_name: # Your experiment name\nmlflow_run_name: # Your run name\nhf_mlflow_log_artifacts: # set to true to copy each saved checkpoint on each save to mlflow artifact registry\n\n# Comet configuration if you're using it\n# Make sure your `COMET_API_KEY` environment variable is set (recommended) or you login to Comet with `comet login`.\n# Check out our documentation for more details https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment-Creation/#comet_ml.start\nuse_comet: # Enable or disable Comet integration.\ncomet_api_key: # API key for Comet. Recommended to set via `comet login`.\ncomet_workspace: # Workspace name in Comet. Defaults to the user's default workspace.\ncomet_project_name: # Project name in Comet. Defaults to Uncategorized.\ncomet_experiment_key: # Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Default to a random key.\ncomet_mode: # Create a new experiment (\"create\") or log to an existing one (\"get\"). Default (\"get_or_create\") auto-selects based on configuration.\ncomet_online: # Set to True to log data to Comet server, or False for offline storage. 
Default is True.\ncomet_experiment_config: # Dictionary for additional configuration settings, see the doc for more details.\n\n# Tensorboard\nuse_tensorboard: # Optional[bool]\n\n# Where to save the full-finetuned model to\noutput_dir: ./completed-model\n\n# Whether to use torch.compile and which backend to use\n# setting to `auto` will enable torch compile when torch>=2.5.1\ntorch_compile: # Optional[Union[Literal[\"auto\"], bool]]\ntorch_compile_backend: # Optional[str]\n\n# Training hyperparameters\n\n# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.\ngradient_accumulation_steps: 1\n# The number of samples to include in each batch. This is the number of samples sent to each GPU.\n# Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: 2\neval_batch_size:\nnum_epochs: 4\nwarmup_steps: 100 # cannot use with warmup_ratio\nwarmup_ratio: 0.05 # cannot use with warmup_steps\nlearning_rate: 0.00003\nlr_quadratic_warmup:\nlogging_steps:\neval_steps: # Leave empty to eval at each epoch, integer for every N steps. float for fraction of total steps\nevals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps\neval_strategy: # Set to `\"no\"` to skip evaluation, `\"epoch\"` at end of each epoch, leave empty to infer from `eval_steps`.\nsave_strategy: # Set to `\"no\"` to skip checkpoint saves, `\"epoch\"` at end of each epoch, `\"best\"` when better result is achieved, leave empty to infer from `save_steps`.\nsave_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps\nsaves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsave_total_limit: # Checkpoints saved at a time\n# Maximum number of iterations to train for. 
It precedes num_epochs which means that\n# if both are set, num_epochs will not be guaranteed.\n# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps:\n\n# bool of whether to include tokens trainer per second in the training metrics. This iterates over the entire dataset once, so it takes some time.\ninclude_tokens_per_second: # Optional[bool]\n\n# whether to find batch size that fits in memory. Passed to underlying transformers Trainer\nauto_find_batch_size: # Optional[bool]\n\neval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0\neval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128\neval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is [\"sacrebleu\", \"comet\", \"ter\", \"chrf\", \"perplexity\"]\n\nprofiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.\n # see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information\n # snapshots can be visualized @ https://pytorch.org/memory_viz\n\nloss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)\nloss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)\n\n# Save model as safetensors (require safetensors package)\nsave_safetensors:\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: false\n# Group similarly sized data to minimize padding.\n# May be slower to start, as it must download and sort the entire dataset.\n# Note that training loss may have an oscillating pattern with this enabled.\ngroup_by_length: false\n\n# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: false\n# 
additional kwargs to pass to the trainer for gradient checkpointing\n# gradient_checkpointing_kwargs:\n# use_reentrant: true\n\n# Stop training after this many evaluation losses have increased in a row\n# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback\nearly_stopping_patience: 3\n\n# Specify a scheduler and kwargs to use with the optimizer\nlr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine\nlr_scheduler_kwargs:\ncosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr\ncosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)\n\n# For one_cycle optim\nlr_div_factor: # Learning rate div factor\n\n# Specify optimizer\n# Valid values are driven by the Transformers OptimizerNames class, see:\n# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134\n#\n# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of\n# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. 
When in doubt, it is recommended to start with the optimizer used\n# in the examples/ for your model and fine-tuning use case.\n#\n# Valid values for 'optimizer' include:\n# - adamw_hf\n# - adamw_torch\n# - adamw_torch_fused\n# - adamw_torch_xla\n# - adamw_apex_fused\n# - adopt_adamw (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)\n# - adafactor\n# - adamw_anyprecision\n# - sgd\n# - adagrad\n# - adamw_bnb_8bit\n# - lion_8bit\n# - lion_32bit\n# - paged_adamw_32bit\n# - paged_adamw_8bit\n# - paged_lion_32bit\n# - paged_lion_8bit\n# - galore_adamw\n# - galore_adamw_8bit\n# - galore_adafactor\n# - galore_adamw_layerwise\n# - galore_adamw_8bit_layerwise\n# - galore_adafactor_layerwise\noptimizer:\n# Dictionary of arguments to pass to the optimizer\noptim_args:\n# For Galore Optimizers the following optim_args are available\n# rank: # type: int\n# update_proj_gap # type: int\n# scale # type: float\n# proj_type: # type: str, default = std\n\n# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm\noptim_target_modules:\n# - self_attn # for llama\n# - mlp\n\n# Specify weight decay\nweight_decay:\n# adamw hyperparams\nadam_beta1:\nadam_beta2:\nadam_epsilon:\n# Gradient clipping max norm\nmax_grad_norm:\n\n# Augmentation techniques\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings\n# currently only supported on Llama and Mistral\nneftune_noise_alpha:\n\n# Whether to bettertransformers\nflash_optimum:\n# Whether to use xformers attention patch https://github.com/facebookresearch/xformers:\nxformers_attention:\n# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:\nflash_attention:\nflash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use 
only\nflash_attn_fuse_qkv: # Whether to fuse QKV into a single operation\nflash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation\n# Whether to use scaled-dot-product attention\n# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention:\n# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention:\n# Optional[bool]. Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage:\n# Resume from a specific checkpoint dir\nresume_from_checkpoint:\n# If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: false\n\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank:\n\n# Add or change special tokens.\n# If you add tokens here, you don't need to add them to the `tokens` list.\nspecial_tokens:\n # bos_token: \"<s>\"\n # eos_token: \"</s>\"\n # unk_token: \"<unk>\"\n # pad_token: \"[PAD]\"\n\n# Add extra tokens.\ntokens:\n\n# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.\n# Only works for tokens that are not part of the base vocab (aka are added_tokens).\n# Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides: # Dict[int, str]\n# 128041: \"<|im_start|>\"\n# 128042: \"<|im_end|>\"\n\n# FSDP\nfsdp:\nfsdp_config:\n\n# Deepspeed config path. 
e.g., deepspeed_configs/zero3.json\ndeepspeed:\n\n# Advanced DDP Arguments\nddp_timeout:\nddp_bucket_cap_mb:\nddp_broadcast_buffers:\n\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path:\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset:\n\n# Debug mode\ndebug:\n\n# Seed\nseed:\n\n# Allow overwrite yml config using from cli\nstrict:", + "text": "# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files\n# This can also be a relative path to a model on disk\nbase_model: ./llama-7b-hf\n# You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)\nbase_model_ignore_patterns:\n# If the base_model repo on hf hub doesn't include configuration .json files,\n# You can set that here, or leave this empty to default to base_model\nbase_model_config: ./llama-7b-hf\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model:\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one defined in the base model\ntokenizer_config:\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too\nmodel_type: AutoModelForCausalLM\n# Corresponding tokenizer for the model AutoTokenizer is a good choice\ntokenizer_type: AutoTokenizer\n# Trust remote code for untrusted source\ntrust_remote_code:\n# use_fast option for tokenizer loading from_pretrained, default to True\ntokenizer_use_fast:\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy:\n# Resize the model embeddings when new tokens are added to multiples of 32\n# This is reported to improve training speed on some models\nresize_token_embeddings_to_32x:\n# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.\nshrink_embeddings:\n# Whether to load the model with randomly initialized weights. 
Useful for\n# pre-training a model from scratch or debugging purposes.\nrandom_init_weights:\n\n# (Internal use only)\n# Used to identify which the model is based on\nis_falcon_derived_model:\nis_llama_derived_model:\nis_qwen_derived_model:\n# Please note that if you set this to true, `padding_side` will be set to \"left\" by default\nis_mistral_derived_model:\n\n# optional overrides to the base model configuration\noverrides_of_model_config:\n # RoPE Scaling https://github.com/huggingface/transformers/pull/24653\n rope_scaling:\n type: # linear | dynamic\n factor: # float\n\n# optional overrides the base model loading from_pretrained\noverrides_of_model_kwargs:\n # use_cache: False\n\n# optional overrides to the bnb 4bit quantization configuration\n# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig\nbnb_config_kwargs:\n # These are default values\n llm_int8_has_fp16_weight: false\n bnb_4bit_quant_type: nf4\n bnb_4bit_use_double_quant: true\n\n\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: true\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: true\n# Use bitsandbytes 4 bit\nload_in_4bit:\n\n# Use CUDA bf16\nbf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere\n# Use CUDA fp16\nfp16: true\n# Use CUDA tf32\ntf32: true # require >=ampere\n\n# No AMP (automatic mixed precision)\nbfloat16: true # require >=ampere\nfloat16: true\n\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset\ngpu_memory_limit: 20GiB\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: true\n\n# List[str]. 
Add plugins to extend the pipeline.\n# See `src/axolotl/integrations` for the available plugins or doc below for more details.\n# https://axolotl-ai-cloud.github.io/axolotl/docs/custom_integrations.html\nplugins:\n # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n\n# A list of one or more datasets to finetune the model with\ndatasets:\n # HuggingFace dataset repo | s3://,gs:// path | \"json\" for local dataset, make sure to fill data_files\n - path: vicgalle/alpaca-gpt4\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>\n ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file\n data_files: # Optional[str] path to source data files\n\n shards: # Optional[int] split dataset into N pieces (use with shards_idx)\n shards_idx: # Optional[int] = 0 the index of sharded dataset to use\n\n preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)\n\n name: # Optional[str] name of dataset configuration to load\n train_on_split: train # Optional[str] name of dataset split to load from\n revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.\n trust_remote_code: # Optional[bool] Trust remote code for untrusted source\n\n # Custom user instruction prompt\n - path: repo\n type:\n # The below are defaults. 
only set what's needed if you use a different column name.\n system_prompt: \"\"\n system_format: \"{system}\"\n field_system: system\n field_instruction: instruction\n field_input: input\n field_output: output\n\n # Customizable to be single line or multi-line\n # Use {instruction}/{input} as key to be replaced\n # 'format' can include {input}\n format: |-\n User: {instruction} {input}\n Assistant:\n # 'no_input_format' cannot include {input}\n no_input_format: \"{instruction} \"\n\n # For `completion` datsets only, uses the provided field instead of `text` column\n field:\n\n # Using chat template\n - path: ...\n # Set type to `chat_template` to use this strategy\n type: chat_template\n # Specify the name of the chat template to use\n # The name of the chat template to use for training, following values are supported:\n # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default.\n # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py\n # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml.\n # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.\n chat_template: tokenizer_default\n\n # Custom jinja chat template. 
Used only if `chat_template: jinja` or empty.\n chat_template_jinja:\n\n # Key containing the messages (default: \"messages\")\n field_messages: messages\n\n # Mapping of properties from the input dataset to the chat template.\n # (default: message_property_mappings={'role':'role', 'content':'content'})\n # If a property exists in the template but not in this mapping, the system will attempt\n # to load it directly from the message using the property name as the key.\n # Example: In the mapping below, 'from' is loaded from input dataset and used as 'role',\n # while 'value' is loaded and used as 'content' in the chat template.\n message_property_mappings:\n role: from\n content: value\n # ...\n\n # Optional[Dict[str, List]]. Roles mapping in the messages. The default is:\n roles:\n user: [\"human\", \"user\"]\n assistant: [\"gpt\", \"assistant\"]\n system: [\"system\"]\n tool: [\"tool\"]\n\n # Optional[bool]. Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. If you wish to,\n # we recommend using a custom jinja template with the default system message removed or\n # adding a system turn with empty content.\n drop_system_message:\n\n # IMPORTANT: The following fields determine which parts of the conversation to train on.\n # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train\n # See examples at `docs/dataset-formats/conversation.qmd`\n # Note: If the below 4 fields are set to empty, defaults to training only on the last message.\n\n # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: [\"assistant\"] # default\n # Optional[str]. Which EOS tokens to train on in the conversation. 
Possible values are:\n # - all: train on all EOS tokens\n # - turn (default): train on the EOS token at the end of each trainable turn\n # - last: train on the last EOS token in the conversation\n # TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.\n train_on_eos: last\n # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.\n message_field_training: training\n # The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.\n # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).\n message_field_training_detail: train_detail\n\n\n# If false, the datasets will not be shuffled and will keep their original order in `datasets`.\n# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: true\n\nDeduplicates datasets and test_datasets with identical entries.\ndataset_exact_deduplication: true\n\n# A list of one or more datasets to eval the model with.\n# You can use either test_datasets, or val_set_size, but not both.\ntest_datasets:\n - path: /workspace/data/eval.jsonl\n ds_type: json\n # You need to specify a split. For \"json\" datasets the default split is called \"train\".\n split: train\n type: completion\n data_files:\n - /workspace/data/eval.jsonl\n\n# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'\nrl:\nrl_beta: # Optional[float]. The beta parameter for the RL training.\n\n# dpo\ndpo_use_weighting: # Optional[bool]. Whether to perform weighting.\nrpo_alpha: # Optional[float]. 
Weighting of NLL term in loss from RPO paper.\n\n# orpo\norpo_alpha: 0.1 # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping.\n\n# kto\nkto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss.\nkto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss.\n\n# simpo\ncpo_alpha: 1.0 # Weight of the BC regularizer\nsimpo_gamma: 0.5 # Target reward margin for the SimPO loss\n\n# grpo\ntrl:\n use_vllm: # Optional[bool]. Whether to use VLLM for RL training.\n vllm_device: # Optional[str]. Device to use for VLLM.\n vllm_gpu_memory_utilization: # Optional[float]. GPU memory utilization for VLLM.\n vllm_max_model_len: # Optional[int]. Maximum length of the model for VLLM.\n vllm_dtype: # Optional[str]. Data type for VLLM.\n\n beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use\n max_completion_length: # Optional[int]. Maximum length of the completion for RL training.\n\n reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir.\n reward_weights: # Optional[list[float]]. List of reward weights for the reward functions.\n\n num_generations: # Optional[int]. Number of generations to sample.\n log_completions: # Optional[bool]. Whether to log completions.\n\n sync_ref_model: # Optional[bool]. Whether to sync the reference model.\n ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.\n ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.\n\n\n# reward modelling: `True` or `False`\nreward_model:\n\n# process reward modelling: `True` or `False`\nprocess_reward_model:\n\n# The name of the chat template to use for training, following values are supported:\n# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. 
If the chat template is not available in the tokenizer, it will raise an error. This is the default value.\n# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py\n# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer.\n# - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.\n# The selected chat template will be saved to the tokenizer_config.json for easier inferencing\n# Note: It is recommended to set train_on_inputs to true when using a chat template that is different from the model's default chat template.\nchat_template: tokenizer_default\n# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.\nchat_template_jinja: null\n# Changes the default system message. Currently only supports chatml.\ndefault_system_message: You are a helpful assistant. Please give a long and detailed answer.\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: data/last_run_prepared\n# Push prepared dataset to hub\npush_dataset_to_hub: # Optional[str] repo_org/repo_name\n# The maximum number of processes to use while preprocessing your input dataset. 
This defaults to `os.cpu_count()`\n# if not set.\ndataset_processes: # defaults to os.cpu_count() if not set\n# Keep dataset in memory while preprocessing\n# Only needed if cached dataset is taking too much storage\ndataset_keep_in_memory:\n# push checkpoints to hub\nhub_model_id: # private repo path to push finetuned model\n# how to push checkpoints to hub\n# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy\nhub_strategy:\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets\n# Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: # boolean\n# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.\nval_set_size: 0.04\n# Num shards for whole dataset\ndataset_shard_num:\n# Index of shard to use for whole dataset\ndataset_shard_idx:\n\n# The maximum length of an input to train with, this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: 2048\n# Pad inputs so each step uses constant sized buffers\n# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently\npad_to_sequence_len:\n# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'\nsample_packing:\n# Set to 'false' if getting errors during eval with sample_packing on.\neval_sample_packing:\n# You can set these packing optimizations AFTER starting a training at least once.\n# The trainer will provide recommended values for these values.\nsample_packing_eff_est:\ntotal_num_tokens:\n# Increasing the following values helps with packing, but usually only slightly (<%1.)\n# The number of samples packed at a time.\nsample_packing_group_size: 100000\n# The number of samples which can be packed into one sequence. 
Increase if using a large sequence_len with many short samples.\nsample_packing_bin_size: 200\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation:\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening:\n\n# Passed through to transformers when loading the model when launched without accelerate\n# Use `sequential` when training w/ model parallelism to limit memory\ndevice_map:\n# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model.\nmax_memory:\n\n# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model\nadapter: lora\n# If you already have a lora model trained that you want to load, put that here.\n# This means after training, if you want to test the model, you should set this to the value of `output_dir`.\n# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir:\n\n# LoRA hyperparameters\n# For more details about the following options, see:\n# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2\nlora_r: 8\nlora_alpha: 16\nlora_dropout: 0.05\nlora_target_modules:\n - q_proj\n - v_proj\n# - k_proj\n# - o_proj\n# - gate_proj\n# - down_proj\n# - up_proj\nlora_target_linear: # If true, will target all linear modules\npeft_layers_to_transform: # The layer indices to transform, otherwise, apply to all layers\n\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. 
It may vary for other models.\n# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\n# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994\nlora_modules_to_save:\n# - embed_tokens\n# - lm_head\n\nlora_fan_in_fan_out: false\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for\n# speed and memory savings\n# See: https://axolotl-ai-cloud.github.io/axolotl/docs/lora_optims.html\nlora_mlp_kernel: true\nlora_qkv_kernel: true\nlora_o_kernel: true\n\n# LoRA+ hyperparameters\n# For more details about the following options, see:\n# https://arxiv.org/abs/2402.12354 and `src/axolotl/core/train_builder.py`\nloraplus_lr_ratio: # loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_embedding: # loraplus learning rate for lora embedding layers. Default value is 1e-6.\n\npeft:\n # Configuration options for loftq initialization for LoRA\n # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization\n loftq_config:\n loftq_bits: # typically 4 bits\n\n# ReLoRA configuration\n# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed\nrelora_steps: # Number of steps per ReLoRA restart\nrelora_warmup_steps: # Number of per-restart warmup steps\nrelora_anneal_steps: # Number of anneal steps for each relora cycle\nrelora_prune_ratio: # threshold for optimizer magnitude when pruning\nrelora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings\n\n# wandb configuration if you're using it\n# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.\nwandb_mode: # \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn off wandb\nwandb_project: # Your wandb project name\nwandb_entity: # A wandb Team name if using a Team\nwandb_watch:\nwandb_name: # Set the name of your wandb 
run\nwandb_run_id: # Set the ID of your wandb run\nwandb_log_model: # \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only at the end of training\n\n# mlflow configuration if you're using it\nmlflow_tracking_uri: # URI to mlflow\nmlflow_experiment_name: # Your experiment name\nmlflow_run_name: # Your run name\nhf_mlflow_log_artifacts: # set to true to copy each saved checkpoint on each save to mlflow artifact registry\n\n# Comet configuration if you're using it\n# Make sure your `COMET_API_KEY` environment variable is set (recommended) or you login to Comet with `comet login`.\n# Check out our documentation for more details https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment-Creation/#comet_ml.start\nuse_comet: # Enable or disable Comet integration.\ncomet_api_key: # API key for Comet. Recommended to set via `comet login`.\ncomet_workspace: # Workspace name in Comet. Defaults to the user's default workspace.\ncomet_project_name: # Project name in Comet. Defaults to Uncategorized.\ncomet_experiment_key: # Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Default to a random key.\ncomet_mode: # Create a new experiment (\"create\") or log to an existing one (\"get\"). Default (\"get_or_create\") auto-selects based on configuration.\ncomet_online: # Set to True to log data to Comet server, or False for offline storage. 
Default is True.\ncomet_experiment_config: # Dictionary for additional configuration settings, see the doc for more details.\n\n# Tensorboard\nuse_tensorboard: # Optional[bool]\n\n# Where to save the full-finetuned model to\noutput_dir: ./completed-model\n\n# Whether to use torch.compile and which backend to use\n# setting to `auto` will enable torch compile when torch>=2.5.1\ntorch_compile: # Optional[Union[Literal[\"auto\"], bool]]\ntorch_compile_backend: # Optional[str]\n\n# Training hyperparameters\n\n# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.\ngradient_accumulation_steps: 1\n# The number of samples to include in each batch. This is the number of samples sent to each GPU.\n# Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: 2\neval_batch_size:\nnum_epochs: 4\nwarmup_steps: 100 # cannot use with warmup_ratio\nwarmup_ratio: 0.05 # cannot use with warmup_steps\nlearning_rate: 0.00003\nlr_quadratic_warmup:\nlogging_steps:\neval_steps: # Leave empty to eval at each epoch, integer for every N steps. float for fraction of total steps\nevals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps\neval_strategy: # Set to `\"no\"` to skip evaluation, `\"epoch\"` at end of each epoch, leave empty to infer from `eval_steps`.\nsave_strategy: # Set to `\"no\"` to skip checkpoint saves, `\"epoch\"` at end of each epoch, `\"best\"` when better result is achieved, leave empty to infer from `save_steps`.\nsave_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps\nsaves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsave_total_limit: # Checkpoints saved at a time\n# Maximum number of iterations to train for. 
It precedes num_epochs which means that\n# if both are set, num_epochs will not be guaranteed.\n# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps:\n\n# bool of whether to include tokens trainer per second in the training metrics. This iterates over the entire dataset once, so it takes some time.\ninclude_tokens_per_second: # Optional[bool]\n\n# whether to find batch size that fits in memory. Passed to underlying transformers Trainer\nauto_find_batch_size: # Optional[bool]\n\neval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0\neval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128\neval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is [\"sacrebleu\", \"comet\", \"ter\", \"chrf\", \"perplexity\"]\n\nprofiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.\n # see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information\n # snapshots can be visualized @ https://pytorch.org/memory_viz\n\nloss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)\nloss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)\n\n# Save model as safetensors (require safetensors package)\nsave_safetensors:\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: false\n# Group similarly sized data to minimize padding.\n# May be slower to start, as it must download and sort the entire dataset.\n# Note that training loss may have an oscillating pattern with this enabled.\ngroup_by_length: false\n\n# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: false\n# 
additional kwargs to pass to the trainer for gradient checkpointing\n# gradient_checkpointing_kwargs:\n# use_reentrant: true\n\n# Stop training after this many evaluation losses have increased in a row\n# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback\nearly_stopping_patience: 3\n\n# Specify a scheduler and kwargs to use with the optimizer\nlr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine\nlr_scheduler_kwargs:\ncosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr\ncosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)\n\n# For one_cycle optim\nlr_div_factor: # Learning rate div factor\n\n# Specify optimizer\n# Valid values are driven by the Transformers OptimizerNames class, see:\n# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134\n#\n# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of\n# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. 
When in doubt, it is recommended to start with the optimizer used\n# in the examples/ for your model and fine-tuning use case.\n#\n# Valid values for 'optimizer' include:\n# - adamw_hf\n# - adamw_torch\n# - adamw_torch_fused\n# - adamw_torch_xla\n# - adamw_apex_fused\n# - adopt_adamw (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)\n# - adafactor\n# - adamw_anyprecision\n# - sgd\n# - adagrad\n# - adamw_bnb_8bit\n# - lion_8bit\n# - lion_32bit\n# - paged_adamw_32bit\n# - paged_adamw_8bit\n# - paged_lion_32bit\n# - paged_lion_8bit\n# - galore_adamw\n# - galore_adamw_8bit\n# - galore_adafactor\n# - galore_adamw_layerwise\n# - galore_adamw_8bit_layerwise\n# - galore_adafactor_layerwise\noptimizer:\n# Dictionary of arguments to pass to the optimizer\noptim_args:\n# For Galore Optimizers the following optim_args are available\n# rank: # type: int\n# update_proj_gap # type: int\n# scale # type: float\n# proj_type: # type: str, default = std\n\n# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm\noptim_target_modules:\n# - self_attn # for llama\n# - mlp\n\n# Specify weight decay\nweight_decay:\n# adamw hyperparams\nadam_beta1:\nadam_beta2:\nadam_epsilon:\n# Gradient clipping max norm\nmax_grad_norm:\n\n# Augmentation techniques\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings\n# currently only supported on Llama and Mistral\nneftune_noise_alpha:\n\n# Whether to bettertransformers\nflash_optimum:\n# Whether to use xformers attention patch https://github.com/facebookresearch/xformers:\nxformers_attention:\n# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:\nflash_attention:\nflash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use 
only\nflash_attn_fuse_qkv: # Whether to fuse QKV into a single operation\nflash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation\n# Whether to use scaled-dot-product attention\n# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention:\n# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention:\n# Optional[bool]. Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage:\n# Resume from a specific checkpoint dir\nresume_from_checkpoint:\n# If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: false\n\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank:\n\n# Add or change special tokens.\n# If you add tokens here, you don't need to add them to the `tokens` list.\nspecial_tokens:\n # bos_token: \"<s>\"\n # eos_token: \"</s>\"\n # unk_token: \"<unk>\"\n # pad_token: \"[PAD]\"\n\n# Add extra tokens.\ntokens:\n\n# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.\n# Only works for tokens that are not part of the base vocab (aka are added_tokens).\n# Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides: # Dict[int, str]\n# 128041: \"<|im_start|>\"\n# 128042: \"<|im_end|>\"\n\n# FSDP\nfsdp:\nfsdp_config:\n\n# Deepspeed config path. 
e.g., deepspeed_configs/zero3.json\ndeepspeed:\n\n# Advanced DDP Arguments\nddp_timeout:\nddp_bucket_cap_mb:\nddp_broadcast_buffers:\n\n# Sequence parallelism\n# Set to a divisor of the number of GPUs available to split sequences into chunks of equal size.\n# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.\n# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized\n# subsequences, or set to 4 to split into four equal-sized subsequences.\n# See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.\nsequence_parallel_degree:\n\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path:\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset:\n\n# Debug mode\ndebug:\n\n# Seed\nseed:\n\n# Allow overwrite yml config using from cli\nstrict:", "crumbs": [ "Getting Started", "Config Reference" @@ -251,7 +251,7 @@ "href": "docs/installation.html#sec-installation-methods", "title": "Installation", "section": "2 Installation Methods", - "text": "2 Installation Methods\n\n2.1 PyPI Installation (Recommended)\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation axolotl[flash-attn,deepspeed]\nWe use --no-build-isolation in order to detect the installed PyTorch version (if installed) in order not to clobber it, and so that we set the correct version of dependencies that are specific to the PyTorch version or other installed co-dependencies.\n\n\n2.2 Edge/Development Build\nFor the latest features between releases:\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\n\n2.3 Docker\ndocker run --gpus '\"all\"' --rm -it axolotlai/axolotl:main-latest\nFor development with Docker:\ndocker compose up -d\n\n\n\n\n\n\nAdvanced Docker 
Configuration\n\n\n\ndocker run --privileged --gpus '\"all\"' --shm-size 10g --rm -it \\\n --name axolotl --ipc=host \\\n --ulimit memlock=-1 --ulimit stack=67108864 \\\n --mount type=bind,src=\"${PWD}\",target=/workspace/axolotl \\\n -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \\\n axolotlai/axolotl:main-latest\n\n\nPlease refer to the Docker documentation for more information on the different Docker images that are available.", + "text": "2 Installation Methods\n\n2.1 PyPI Installation (Recommended)\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation axolotl[flash-attn,deepspeed]\nWe use --no-build-isolation in order to detect the installed PyTorch version (if\ninstalled) in order not to clobber it, and so that we set the correct version of\ndependencies that are specific to the PyTorch version or other installed\nco-dependencies.\n\n\n2.2 Edge/Development Build\nFor the latest features between releases:\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\n\n2.3 Docker\ndocker run --gpus '\"all\"' --rm -it axolotlai/axolotl:main-latest\nFor development with Docker:\ndocker compose up -d\n\n\n\n\n\n\nAdvanced Docker Configuration\n\n\n\ndocker run --privileged --gpus '\"all\"' --shm-size 10g --rm -it \\\n --name axolotl --ipc=host \\\n --ulimit memlock=-1 --ulimit stack=67108864 \\\n --mount type=bind,src=\"${PWD}\",target=/workspace/axolotl \\\n -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \\\n axolotlai/axolotl:main-latest\n\n\nPlease refer to the Docker documentation for more information on the different Docker images that are available.", "crumbs": [ "Getting Started", "Installation" @@ -317,7 +317,7 @@ "href": "docs/reward_modelling.html", "title": "Reward Modelling", "section": "", - "text": "Overview\nReward modelling is a technique used to train models to predict the 
reward or value of a given input. This is particularly useful in reinforcement learning scenarios where the model needs to evaluate the quality of its actions or predictions. We support the reward modelling techniques supported by trl.\n\n\n(Outcome) Reward Models\nOutcome reward models are trained using data which contains preference annotations for an entire interaction between the user and model (e.g. rather than per-turn or per-step).\nbase_model: google/gemma-2-2b\nmodel_type: AutoModelForSequenceClassification\nnum_labels: 1\ntokenizer_type: AutoTokenizer\n\nreward_model: true\nchat_template: gemma\ndatasets:\n - path: argilla/distilabel-intel-orca-dpo-pairs\n type: bradley_terry.chat_template\n\nval_set_size: 0.1\neval_steps: 100\nBradley-Terry chat templates expect single-turn conversations in the following format:\n{\n \"system\": \"...\", // optional\n \"input\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nProcess Reward Models (PRM)\n\n\n\n\n\n\nTip\n\n\n\nCheck out our PRM blog.\n\n\nProcess reward models are trained using data which contains preference annotations for each step in a series of interactions. Typically, PRMs are trained to provide reward signals over each step of a reasoning trace and are used for downstream reinforcement learning.\nbase_model: Qwen/Qwen2.5-3B\nmodel_type: AutoModelForTokenClassification\nnum_labels: 2\n\nprocess_reward_model: true\ndatasets:\n - path: trl-lib/math_shepherd\n type: stepwise_supervised\n split: train\n\nval_set_size: 0.1\neval_steps: 100\nPlease see stepwise_supervised for more details on the dataset format.", + "text": "Overview\nReward modelling is a technique used to train models to predict the reward or value of a given input. 
This is particularly useful in reinforcement learning scenarios where the model needs to evaluate the quality of its actions or predictions.\nWe support the reward modelling techniques supported by trl.\n\n\n(Outcome) Reward Models\nOutcome reward models are trained using data which contains preference annotations for an entire interaction between the user and model (e.g. rather than per-turn or per-step).\nbase_model: google/gemma-2-2b\nmodel_type: AutoModelForSequenceClassification\nnum_labels: 1\ntokenizer_type: AutoTokenizer\n\nreward_model: true\nchat_template: gemma\ndatasets:\n - path: argilla/distilabel-intel-orca-dpo-pairs\n type: bradley_terry.chat_template\n\nval_set_size: 0.1\neval_steps: 100\nBradley-Terry chat templates expect single-turn conversations in the following format:\n{\n \"system\": \"...\", // optional\n \"input\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nProcess Reward Models (PRM)\n\n\n\n\n\n\nTip\n\n\n\nCheck out our PRM blog.\n\n\nProcess reward models are trained using data which contains preference annotations for each step in a series of interactions. Typically, PRMs are trained to provide reward signals over each step of a reasoning trace and are used for downstream reinforcement learning.\nbase_model: Qwen/Qwen2.5-3B\nmodel_type: AutoModelForTokenClassification\nnum_labels: 2\n\nprocess_reward_model: true\ndatasets:\n - path: trl-lib/math_shepherd\n type: stepwise_supervised\n split: train\n\nval_set_size: 0.1\neval_steps: 100\nPlease see stepwise_supervised for more details on the dataset format.", "crumbs": [ "How To Guides", "Reward Modelling" @@ -335,12 +335,943 @@ "href": "docs/multimodal.html", "title": "MultiModal / Vision Language Models (BETA)", "section": "", - "text": "MultiModal / Vision Language Models (BETA)\n\nSupported Models\n\nMllama, i.e. llama with vision models\n\n\n\nUsage\nCurrently multimodal support is limited and doesn’t have full feature parity. 
To finetune a multimodal Llama w/ LoRA, you’ll need to use the following in YAML in combination with the rest of the required hyperparams.\nbase_model: alpindale/Llama-3.2-11B-Vision-Instruct\nprocessor_type: AutoProcessor\nskip_prepare_dataset: true\n\nchat_template: llama3_2_vision\ndatasets:\n - path: HuggingFaceH4/llava-instruct-mix-vsft\n type: chat_template\n split: train[:1%]\n field_messages: messages\nremove_unused_columns: false\nsample_packing: false\n\n# only finetune the Language model, leave the vision model and vision tower frozen\nlora_target_modules: 'language_model.model.layers.[\\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'", + "text": "MultiModal / Vision Language Models (BETA)\n\nSupported Models\n\nMllama, i.e. llama with vision models\n\n\n\nUsage\nCurrently multimodal support is limited and doesn’t have full feature parity. To finetune a multimodal Llama w/ LoRA,\nyou’ll need to use the following in YAML in combination with the rest of the required hyperparams.\nbase_model: alpindale/Llama-3.2-11B-Vision-Instruct\nprocessor_type: AutoProcessor\nskip_prepare_dataset: true\n\nchat_template: llama3_2_vision\ndatasets:\n - path: HuggingFaceH4/llava-instruct-mix-vsft\n type: chat_template\n split: train[:1%]\n field_messages: messages\nremove_unused_columns: false\nsample_packing: false\n\n# only finetune the Language model, leave the vision model and vision tower frozen\nlora_target_modules: 'language_model.model.layers.[\\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'", "crumbs": [ "How To Guides", "MultiModal / Vision Language Models (BETA)" ] }, + { + "objectID": "docs/api/utils.callbacks.mlflow_.html", + "href": "docs/api/utils.callbacks.mlflow_.html", + "title": "utils.callbacks.mlflow_", + "section": "", + "text": "utils.callbacks.mlflow_\nMLFlow module for trainer callbacks\n\n\n\n\n\nName\nDescription\n\n\n\n\nSaveAxolotlConfigtoMlflowCallback\nCallback to save axolotl config to 
mlflow\n\n\n\n\n\nutils.callbacks.mlflow_.SaveAxolotlConfigtoMlflowCallback(\n self,\n axolotl_config_path,\n)\nCallback to save axolotl config to mlflow" + }, + { + "objectID": "docs/api/utils.callbacks.mlflow_.html#classes", + "href": "docs/api/utils.callbacks.mlflow_.html#classes", + "title": "utils.callbacks.mlflow_", + "section": "", + "text": "Name\nDescription\n\n\n\n\nSaveAxolotlConfigtoMlflowCallback\nCallback to save axolotl config to mlflow\n\n\n\n\n\nutils.callbacks.mlflow_.SaveAxolotlConfigtoMlflowCallback(\n self,\n axolotl_config_path,\n)\nCallback to save axolotl config to mlflow" + }, + { + "objectID": "docs/api/monkeypatch.trainer_fsdp_optim.html", + "href": "docs/api/monkeypatch.trainer_fsdp_optim.html", + "title": "monkeypatch.trainer_fsdp_optim", + "section": "", + "text": "monkeypatch.trainer_fsdp_optim\nfix for FSDP optimizer save in trainer w 4.47.0\n\n\n\n\n\nName\nDescription\n\n\n\n\npatch_training_loop_for_fsdp\nmonkeypatch for fixing the training loop for fsdp with optimizer save\n\n\n\n\n\nmonkeypatch.trainer_fsdp_optim.patch_training_loop_for_fsdp()\nmonkeypatch for fixing the training loop for fsdp with optimizer save" + }, + { + "objectID": "docs/api/monkeypatch.trainer_fsdp_optim.html#functions", + "href": "docs/api/monkeypatch.trainer_fsdp_optim.html#functions", + "title": "monkeypatch.trainer_fsdp_optim", + "section": "", + "text": "Name\nDescription\n\n\n\n\npatch_training_loop_for_fsdp\nmonkeypatch for fixing the training loop for fsdp with optimizer save\n\n\n\n\n\nmonkeypatch.trainer_fsdp_optim.patch_training_loop_for_fsdp()\nmonkeypatch for fixing the training loop for fsdp with optimizer save" + }, + { + "objectID": "docs/api/monkeypatch.data.batch_dataset_fetcher.html", + "href": "docs/api/monkeypatch.data.batch_dataset_fetcher.html", + "title": "monkeypatch.data.batch_dataset_fetcher", + "section": "", + "text": "monkeypatch.data.batch_dataset_fetcher\nmonkeypatch.data.batch_dataset_fetcher\nmonkey patches for the dataset 
fetcher to handle batches of packed indexes" + }, + { + "objectID": "docs/api/prompt_strategies.stepwise_supervised.html", + "href": "docs/api/prompt_strategies.stepwise_supervised.html", + "title": "prompt_strategies.stepwise_supervised", + "section": "", + "text": "prompt_strategies.stepwise_supervised\nModule for stepwise datasets, typically including a prompt and reasoning traces,\nand (optionally) per-step, or per-prompt-trace labels for reward modelling.\n\n\n\n\n\nName\nDescription\n\n\n\n\nStepwiseSupervisedPromptTokenizingStrategy\nTokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.\n\n\n\n\n\nprompt_strategies.stepwise_supervised.StepwiseSupervisedPromptTokenizingStrategy(\n self,\n tokenizer,\n sequence_len=2048,\n step_separator='\\n',\n max_completion_length=None,\n train_on_last_step_only=False,\n)\nTokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.\nThese datasets should include the following columns:\n- prompt: the prompt text\n- completions: a list of n completion steps\n- labels: a list of n labels indicating the “correctness” of each step" + }, + { + "objectID": "docs/api/prompt_strategies.stepwise_supervised.html#classes", + "href": "docs/api/prompt_strategies.stepwise_supervised.html#classes", + "title": "prompt_strategies.stepwise_supervised", + "section": "", + "text": "Name\nDescription\n\n\n\n\nStepwiseSupervisedPromptTokenizingStrategy\nTokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.\n\n\n\n\n\nprompt_strategies.stepwise_supervised.StepwiseSupervisedPromptTokenizingStrategy(\n self,\n tokenizer,\n sequence_len=2048,\n step_separator='\\n',\n max_completion_length=None,\n train_on_last_step_only=False,\n)\nTokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.\nThese datasets should include the following columns:\n- prompt: the prompt text\n- completions: a list of n completion steps\n- labels: a 
list of n labels indicating the “correctness” of each step" + }, + { + "objectID": "docs/api/monkeypatch.mistral_attn_hijack_flash.html", + "href": "docs/api/monkeypatch.mistral_attn_hijack_flash.html", + "title": "monkeypatch.mistral_attn_hijack_flash", + "section": "", + "text": "monkeypatch.mistral_attn_hijack_flash\nFlash attention monkey patch for mistral model\n\n\n\n\n\nName\nDescription\n\n\n\n\nMistralDecoderLayer\npatched version of MistralDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer()\npatched version of MistralDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nName\nDescription\n\n\n\n\nforward\n\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer.forward(\n hidden_states,\n attention_mask=None,\n position_ids=None,\n past_key_value=None,\n output_attentions=False,\n use_cache=False,\n cu_seqlens=None,\n max_seqlen=None,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nhidden_states\ntorch.FloatTensor\ninput to the layer of shape (batch, seq_len, embed_dim)\nrequired\n\n\nattention_mask\ntorch.FloatTensor, optional\nattention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.\nNone\n\n\noutput_attentions\nbool, optional\nWhether or not to return the attentions tensors of all attention layers. 
See attentions under returned tensors for more detail.\nFalse\n\n\nuse_cache\nbool, optional\nIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).\nFalse\n\n\npast_key_value\nTuple(torch.FloatTensor), optional\ncached past key and value projection states\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ngenerate_qkv\n\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.generate_qkv(\n q,\n k,\n v,\n query_padding_mask=None,\n key_padding_mask=None,\n kvpacked=False,\n qkvpacked=False,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nq\n\n(batch_size, seqlen_q, nheads, d)\nrequired\n\n\nk\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nv\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nquery_padding_mask\n\n(batch_size, seqlen), bool\nNone\n\n\nkey_padding_mask\n\n(batch_size, seqlen), bool\nNone" + }, + { + "objectID": "docs/api/monkeypatch.mistral_attn_hijack_flash.html#classes", + "href": "docs/api/monkeypatch.mistral_attn_hijack_flash.html#classes", + "title": "monkeypatch.mistral_attn_hijack_flash", + "section": "", + "text": "Name\nDescription\n\n\n\n\nMistralDecoderLayer\npatched version of MistralDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer()\npatched version of MistralDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nName\nDescription\n\n\n\n\nforward\n\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer.forward(\n hidden_states,\n attention_mask=None,\n position_ids=None,\n past_key_value=None,\n output_attentions=False,\n use_cache=False,\n cu_seqlens=None,\n max_seqlen=None,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nhidden_states\ntorch.FloatTensor\ninput to the layer of shape (batch, seq_len, embed_dim)\nrequired\n\n\nattention_mask\ntorch.FloatTensor, optional\nattention mask of size (batch, 1, tgt_len, 
src_len) where padding elements are indicated by very large negative values.\nNone\n\n\noutput_attentions\nbool, optional\nWhether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.\nFalse\n\n\nuse_cache\nbool, optional\nIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).\nFalse\n\n\npast_key_value\nTuple(torch.FloatTensor), optional\ncached past key and value projection states\nNone" + }, + { + "objectID": "docs/api/monkeypatch.mistral_attn_hijack_flash.html#functions", + "href": "docs/api/monkeypatch.mistral_attn_hijack_flash.html#functions", + "title": "monkeypatch.mistral_attn_hijack_flash", + "section": "", + "text": "Name\nDescription\n\n\n\n\ngenerate_qkv\n\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.generate_qkv(\n q,\n k,\n v,\n query_padding_mask=None,\n key_padding_mask=None,\n kvpacked=False,\n qkvpacked=False,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nq\n\n(batch_size, seqlen_q, nheads, d)\nrequired\n\n\nk\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nv\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nquery_padding_mask\n\n(batch_size, seqlen), bool\nNone\n\n\nkey_padding_mask\n\n(batch_size, seqlen), bool\nNone" + }, + { + "objectID": "docs/api/prompt_strategies.dpo.user_defined.html", + "href": "docs/api/prompt_strategies.dpo.user_defined.html", + "title": "prompt_strategies.dpo.user_defined", + "section": "", + "text": "prompt_strategies.dpo.user_defined\nprompt_strategies.dpo.user_defined\nUser-defined DPO strategies" + }, + { + "objectID": "docs/api/integrations.liger.args.html", + "href": "docs/api/integrations.liger.args.html", + "title": "integrations.liger.args", + "section": "", + "text": "integrations.liger.args\nModule for handling LIGER input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nLigerArgs\nInput args for 
LIGER.\n\n\n\n\n\nintegrations.liger.args.LigerArgs()\nInput args for LIGER." + }, + { + "objectID": "docs/api/integrations.liger.args.html#classes", + "href": "docs/api/integrations.liger.args.html#classes", + "title": "integrations.liger.args", + "section": "", + "text": "Name\nDescription\n\n\n\n\nLigerArgs\nInput args for LIGER.\n\n\n\n\n\nintegrations.liger.args.LigerArgs()\nInput args for LIGER." + }, + { + "objectID": "docs/api/utils.schemas.training.html", + "href": "docs/api/utils.schemas.training.html", + "title": "utils.schemas.training", + "section": "", + "text": "utils.schemas.training\nPydantic models for training hyperparameters\n\n\n\n\n\nName\nDescription\n\n\n\n\nHyperparametersConfig\nTraining hyperparams configuration subset\n\n\nLrGroup\nCustom learning rate group configuration\n\n\n\n\n\nutils.schemas.training.HyperparametersConfig()\nTraining hyperparams configuration subset\n\n\n\nutils.schemas.training.LrGroup()\nCustom learning rate group configuration" + }, + { + "objectID": "docs/api/utils.schemas.training.html#classes", + "href": "docs/api/utils.schemas.training.html#classes", + "title": "utils.schemas.training", + "section": "", + "text": "Name\nDescription\n\n\n\n\nHyperparametersConfig\nTraining hyperparams configuration subset\n\n\nLrGroup\nCustom learning rate group configuration\n\n\n\n\n\nutils.schemas.training.HyperparametersConfig()\nTraining hyperparams configuration subset\n\n\n\nutils.schemas.training.LrGroup()\nCustom learning rate group configuration" + }, + { + "objectID": "docs/api/datasets.html", + "href": "docs/api/datasets.html", + "title": "datasets", + "section": "", + "text": "datasets\nModule containing Dataset functionality\n\n\n\n\n\nName\nDescription\n\n\n\n\nConstantLengthDataset\nIterable dataset that returns constant length chunks of tokens from stream of text files.\n\n\nTokenizedPromptDataset\nDataset that returns tokenized prompts from a stream of text 
files.\n\n\n\n\n\ndatasets.ConstantLengthDataset(self, tokenizer, datasets, seq_length=2048)\nIterable dataset that returns constant length chunks of tokens from stream of text files.\nArgs:\ntokenizer (Tokenizer): The processor used for processing the data.\ndataset (dataset.Dataset): Dataset with text files.\nseq_length (int): Length of token sequences to return.\n\n\n\ndatasets.TokenizedPromptDataset(\n self,\n prompt_tokenizer,\n dataset,\n process_count=None,\n keep_in_memory=False,\n **kwargs,\n)\nDataset that returns tokenized prompts from a stream of text files.\nArgs:\nprompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for processing the data.\ndataset (dataset.Dataset): Dataset with text files.\nprocess_count (int): Number of processes to use for tokenizing.\nkeep_in_memory (bool): Whether to keep the tokenized dataset in memory." + }, + { + "objectID": "docs/api/datasets.html#classes", + "href": "docs/api/datasets.html#classes", + "title": "datasets", + "section": "", + "text": "Name\nDescription\n\n\n\n\nConstantLengthDataset\nIterable dataset that returns constant length chunks of tokens from stream of text files.\n\n\nTokenizedPromptDataset\nDataset that returns tokenized prompts from a stream of text files.\n\n\n\n\n\ndatasets.ConstantLengthDataset(self, tokenizer, datasets, seq_length=2048)\nIterable dataset that returns constant length chunks of tokens from stream of text files.\nArgs:\ntokenizer (Tokenizer): The processor used for processing the data.\ndataset (dataset.Dataset): Dataset with text files.\nseq_length (int): Length of token sequences to return.\n\n\n\ndatasets.TokenizedPromptDataset(\n self,\n prompt_tokenizer,\n dataset,\n process_count=None,\n keep_in_memory=False,\n **kwargs,\n)\nDataset that returns tokenized prompts from a stream of text files.\nArgs:\nprompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for processing the data.\ndataset (dataset.Dataset): Dataset with text 
files.\nprocess_count (int): Number of processes to use for tokenizing.\nkeep_in_memory (bool): Whether to keep the tokenized dataset in memory." + }, + { + "objectID": "docs/api/kernels.geglu.html", + "href": "docs/api/kernels.geglu.html", + "title": "kernels.geglu", + "section": "", + "text": "kernels.geglu\nModule for definition of GEGLU Triton kernels.\nSee “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).\nCredit to unsloth (https://unsloth.ai/) for inspiration for this implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\ngeglu_backward\nGEGLU backward pass using in-place operations.\n\n\ngeglu_forward\nGEGLU forward pass.\n\n\n\n\n\nkernels.geglu.geglu_backward(grad_output, gate, up)\nGEGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - GEGLU activation output (h) - Gradient with respect to gate (grad_gate) - Gradient with respect to up (grad_up)\n\n\n\n\n\n\nThis function modifies its input tensors in-place to store results.\n\n\n\n\nkernels.geglu.geglu_forward(gate, up)\nGEGLU forward pass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\ntorch.Tensor: Output tensor of shape [batch, seq_len, hidden_dim]." 
+ }, + { + "objectID": "docs/api/kernels.geglu.html#functions", + "href": "docs/api/kernels.geglu.html#functions", + "title": "kernels.geglu", + "section": "", + "text": "Name\nDescription\n\n\n\n\ngeglu_backward\nGEGLU backward pass using in-place operations.\n\n\ngeglu_forward\nGEGLU forward pass.\n\n\n\n\n\nkernels.geglu.geglu_backward(grad_output, gate, up)\nGEGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - GEGLU activation output (h) - Gradient with respect to gate (grad_gate) - Gradient with respect to up (grad_up)\n\n\n\n\n\n\nThis function modifies its input tensors in-place to store results.\n\n\n\n\nkernels.geglu.geglu_forward(gate, up)\nGEGLU forward pass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\ntorch.Tensor: Output tensor of shape [batch, seq_len, hidden_dim]." 
+ }, + { + "objectID": "docs/api/monkeypatch.llama_attn_hijack_flash.html", + "href": "docs/api/monkeypatch.llama_attn_hijack_flash.html", + "title": "monkeypatch.llama_attn_hijack_flash", + "section": "", + "text": "monkeypatch.llama_attn_hijack_flash\nFlash attention monkey patch for llama model\n\n\n\n\n\nName\nDescription\n\n\n\n\nFusedAttention\nFused QKV Attention layer for incrementally improved training efficiency\n\n\nLlamaDecoderLayer\npatched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.FusedAttention(self, config, q, k, v, o)\nFused QKV Attention layer for incrementally improved training efficiency\n\n\n\nmonkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer()\npatched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nName\nDescription\n\n\n\n\nforward\n\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer.forward(\n hidden_states,\n attention_mask=None,\n position_ids=None,\n past_key_value=None,\n output_attentions=False,\n use_cache=False,\n padding_mask=None,\n cu_seqlens=None,\n max_seqlen=None,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nhidden_states\ntorch.FloatTensor\ninput to the layer of shape (batch, seq_len, embed_dim)\nrequired\n\n\nattention_mask\ntorch.FloatTensor, optional\nattention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.\nNone\n\n\noutput_attentions\nbool, optional\nWhether or not to return the attentions tensors of all attention layers. 
See attentions under returned tensors for more detail.\nFalse\n\n\nuse_cache\nbool, optional\nIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).\nFalse\n\n\npast_key_value\nTuple(torch.FloatTensor), optional\ncached past key and value projection states\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nflashattn_forward\nInput shape: Batch x Time x Channel\n\n\nflashattn_forward_with_s2attn\nInput shape: Batch x Time x Channel\n\n\ngenerate_qkv\n\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward(\n self,\n hidden_states,\n attention_mask=None,\n position_ids=None,\n past_key_value=None,\n output_attentions=False,\n use_cache=False,\n padding_mask=None,\n cu_seqlens=None,\n max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nattention_mask: [bsz, q_len]\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward_with_s2attn(\n self,\n hidden_states,\n attention_mask=None,\n position_ids=None,\n past_key_value=None,\n output_attentions=False,\n use_cache=False,\n padding_mask=None,\n cu_seqlens=None,\n max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nFrom: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py\nattention_mask: [bsz, q_len]\ncu_seqlens will be ignored if provided\nmax_seqlen will be ignored if provided\n\n\n\nmonkeypatch.llama_attn_hijack_flash.generate_qkv(\n q,\n k,\n v,\n query_padding_mask=None,\n key_padding_mask=None,\n kvpacked=False,\n qkvpacked=False,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nq\n\n(batch_size, seqlen_q, nheads, d)\nrequired\n\n\nk\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nv\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nquery_padding_mask\n\n(batch_size, seqlen), bool\nNone\n\n\nkey_padding_mask\n\n(batch_size, seqlen), bool\nNone" + }, + { + "objectID": "docs/api/monkeypatch.llama_attn_hijack_flash.html#classes", + "href": 
"docs/api/monkeypatch.llama_attn_hijack_flash.html#classes", + "title": "monkeypatch.llama_attn_hijack_flash", + "section": "", + "text": "Name\nDescription\n\n\n\n\nFusedAttention\nFused QKV Attention layer for incrementally improved training efficiency\n\n\nLlamaDecoderLayer\npatched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.FusedAttention(self, config, q, k, v, o)\nFused QKV Attention layer for incrementally improved training efficiency\n\n\n\nmonkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer()\npatched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nName\nDescription\n\n\n\n\nforward\n\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer.forward(\n hidden_states,\n attention_mask=None,\n position_ids=None,\n past_key_value=None,\n output_attentions=False,\n use_cache=False,\n padding_mask=None,\n cu_seqlens=None,\n max_seqlen=None,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nhidden_states\ntorch.FloatTensor\ninput to the layer of shape (batch, seq_len, embed_dim)\nrequired\n\n\nattention_mask\ntorch.FloatTensor, optional\nattention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.\nNone\n\n\noutput_attentions\nbool, optional\nWhether or not to return the attentions tensors of all attention layers. 
See attentions under returned tensors for more detail.\nFalse\n\n\nuse_cache\nbool, optional\nIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).\nFalse\n\n\npast_key_value\nTuple(torch.FloatTensor), optional\ncached past key and value projection states\nNone" + }, + { + "objectID": "docs/api/monkeypatch.llama_attn_hijack_flash.html#functions", + "href": "docs/api/monkeypatch.llama_attn_hijack_flash.html#functions", + "title": "monkeypatch.llama_attn_hijack_flash", + "section": "", + "text": "Name\nDescription\n\n\n\n\nflashattn_forward\nInput shape: Batch x Time x Channel\n\n\nflashattn_forward_with_s2attn\nInput shape: Batch x Time x Channel\n\n\ngenerate_qkv\n\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward(\n self,\n hidden_states,\n attention_mask=None,\n position_ids=None,\n past_key_value=None,\n output_attentions=False,\n use_cache=False,\n padding_mask=None,\n cu_seqlens=None,\n max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nattention_mask: [bsz, q_len]\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward_with_s2attn(\n self,\n hidden_states,\n attention_mask=None,\n position_ids=None,\n past_key_value=None,\n output_attentions=False,\n use_cache=False,\n padding_mask=None,\n cu_seqlens=None,\n max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nFrom: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py\nattention_mask: [bsz, q_len]\ncu_seqlens will be ignored if provided\nmax_seqlen will be ignored if provided\n\n\n\nmonkeypatch.llama_attn_hijack_flash.generate_qkv(\n q,\n k,\n v,\n query_padding_mask=None,\n key_padding_mask=None,\n kvpacked=False,\n qkvpacked=False,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nq\n\n(batch_size, seqlen_q, nheads, d)\nrequired\n\n\nk\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nv\n\n(batch_size, seqlen_k, nheads_k, 
d)\nrequired\n\n\nquery_padding_mask\n\n(batch_size, seqlen), bool\nNone\n\n\nkey_padding_mask\n\n(batch_size, seqlen), bool\nNone" + }, + { + "objectID": "docs/api/cli.sweeps.html", + "href": "docs/api/cli.sweeps.html", + "title": "cli.sweeps", + "section": "", + "text": "cli.sweeps\nUtilities for handling sweeps over configs for axolotl train CLI command\n\n\n\n\n\nName\nDescription\n\n\n\n\ngenerate_sweep_configs\nRecursively generates all possible configurations by applying sweeps to the base config.\n\n\n\n\n\ncli.sweeps.generate_sweep_configs(base_config, sweeps_config)\nRecursively generates all possible configurations by applying sweeps to the base config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbase_config\ndict\nThe original configuration dictionary\nrequired\n\n\nsweeps_config\ndict\nDictionary where keys are parameters and values are either: - lists of values to sweep independently - or for paired values, a list of dicts under the ’_’ key\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nlist\nlist[dict[str, list]]\nList of all possible configuration dictionaries\n\n\n\n\n\n\nsweeps_config = {\n‘learning_rate’: [0.1, 0.01],\n’_’: [\n{‘load_in_8bit’: True, ‘adapter’: ‘lora’},\n{‘load_in_4bit’: True, ‘adapter’: ‘qlora’}\n]\n}" + }, + { + "objectID": "docs/api/cli.sweeps.html#functions", + "href": "docs/api/cli.sweeps.html#functions", + "title": "cli.sweeps", + "section": "", + "text": "Name\nDescription\n\n\n\n\ngenerate_sweep_configs\nRecursively generates all possible configurations by applying sweeps to the base config.\n\n\n\n\n\ncli.sweeps.generate_sweep_configs(base_config, sweeps_config)\nRecursively generates all possible configurations by applying sweeps to the base config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbase_config\ndict\nThe original configuration dictionary\nrequired\n\n\nsweeps_config\ndict\nDictionary where keys are parameters and values are either: - lists of 
values to sweep independently - or for paired values, a list of dicts under the ’_’ key\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nlist\nlist[dict[str, list]]\nList of all possible configuration dictionaries\n\n\n\n\n\n\nsweeps_config = {\n‘learning_rate’: [0.1, 0.01],\n’_’: [\n{‘load_in_8bit’: True, ‘adapter’: ‘lora’},\n{‘load_in_4bit’: True, ‘adapter’: ‘qlora’}\n]\n}" + }, + { + "objectID": "docs/api/utils.freeze.html", + "href": "docs/api/utils.freeze.html", + "title": "utils.freeze", + "section": "", + "text": "utils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\n\n\n\nName\nDescription\n\n\n\n\nLayerNamePattern\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nutils.freeze.LayerNamePattern(self, pattern)\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nChecks if the given layer name matches the regex pattern.\n\n\n\n\n\nutils.freeze.LayerNamePattern.match(name)\nChecks if the given layer name matches the regex pattern.\nParameters:\n- name (str): The layer name to check.\nReturns:\n- bool: True if the layer name matches the pattern, False otherwise.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nfreeze_layers_except\nFreezes all layers of the given model except for the layers that match given regex patterns.\n\n\n\n\n\nutils.freeze.freeze_layers_except(model, regex_patterns)\nFreezes all layers of the given model except for the layers that match given regex patterns.\nPeriods in the patterns are treated as literal periods, not as wildcard characters.\nParameters:\n- model (nn.Module): The PyTorch model to be modified.\n- regex_patterns (list of str): List of regex patterns to match layer names to keep unfrozen.\nNote that you cannot use a dot as a wildcard character in the patterns since it is reserved for separating layer names.\nAlso, to match the entire layer name, the pattern 
should start with “^” and end with “\\(\", otherwise it will match any part of the layer name.\n The range pattern part is optional and it is not compiled as a regex pattern which means you must put \"\\)” before the range pattern if you want to match the entire layer name.\nE.g., [“^model.embed_tokens.weight\\([:32000]\", \"layers.2[0-9]+.block_sparse_moe.gate.[a-z]+\\)”]\nReturns:\nNone; the model is modified in place." + }, + { + "objectID": "docs/api/utils.freeze.html#classes", + "href": "docs/api/utils.freeze.html#classes", + "title": "utils.freeze", + "section": "", + "text": "Name\nDescription\n\n\n\n\nLayerNamePattern\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nutils.freeze.LayerNamePattern(self, pattern)\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nChecks if the given layer name matches the regex pattern.\n\n\n\n\n\nutils.freeze.LayerNamePattern.match(name)\nChecks if the given layer name matches the regex pattern.\nParameters:\n- name (str): The layer name to check.\nReturns:\n- bool: True if the layer name matches the pattern, False otherwise." 
+ }, + { + "objectID": "docs/api/utils.freeze.html#functions", + "href": "docs/api/utils.freeze.html#functions", + "title": "utils.freeze", + "section": "", + "text": "Name\nDescription\n\n\n\n\nfreeze_layers_except\nFreezes all layers of the given model except for the layers that match given regex patterns.\n\n\n\n\n\nutils.freeze.freeze_layers_except(model, regex_patterns)\nFreezes all layers of the given model except for the layers that match given regex patterns.\nPeriods in the patterns are treated as literal periods, not as wildcard characters.\nParameters:\n- model (nn.Module): The PyTorch model to be modified.\n- regex_patterns (list of str): List of regex patterns to match layer names to keep unfrozen.\nNote that you cannot use a dot as a wildcard character in the patterns since it is reserved for separating layer names.\nAlso, to match the entire layer name, the pattern should start with “^” and end with “\\(\", otherwise it will match any part of the layer name.\n The range pattern part is optional and it is not compiled as a regex pattern which means you must put \"\\)” before the range pattern if you want to match the entire layer name.\nE.g., [“^model.embed_tokens.weight\\([:32000]\", \"layers.2[0-9]+.block_sparse_moe.gate.[a-z]+\\)”]\nReturns:\nNone; the model is modified in place." 
+ }, + { + "objectID": "docs/api/monkeypatch.multipack.html", + "href": "docs/api/monkeypatch.multipack.html", + "title": "monkeypatch.multipack", + "section": "", + "text": "monkeypatch.multipack\nmonkeypatch.multipack\nmultipack patching for v2 of sample packing" + }, + { + "objectID": "docs/api/cli.main.html", + "href": "docs/api/cli.main.html", + "title": "cli.main", + "section": "", + "text": "cli.main\nClick CLI definitions for various axolotl commands.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncli\nAxolotl CLI - Train and fine-tune large language models\n\n\nevaluate\nEvaluate a model.\n\n\nfetch\nFetch example configs or other resources.\n\n\ninference\nRun inference with a trained model.\n\n\nmerge_lora\nMerge trained LoRA adapters into a base model.\n\n\nmerge_sharded_fsdp_weights\nMerge sharded FSDP model weights.\n\n\npreprocess\nPreprocess datasets before training.\n\n\ntrain\nTrain or fine-tune a model.\n\n\n\n\n\ncli.main.cli()\nAxolotl CLI - Train and fine-tune large language models\n\n\n\ncli.main.evaluate(config, accelerate, **kwargs)\nEvaluate a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.fetch(directory, dest)\nFetch example configs or other resources.\nAvailable directories:\n- examples: Example configuration files\n- deepspeed_configs: DeepSpeed configuration files\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndirectory\nstr\nOne of examples, deepspeed_configs.\nrequired\n\n\ndest\nOptional[str]\nOptional destination directory.\nrequired\n\n\n\n\n\n\n\ncli.main.inference(config, accelerate, gradio, **kwargs)\nRun inference with a trained model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML 
file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\ngradio\nbool\nWhether to use Gradio browser interface or command line for inference.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_lora(config, **kwargs)\nMerge trained LoRA adapters into a base model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_sharded_fsdp_weights(config, accelerate, **kwargs)\nMerge sharded FSDP model weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.preprocess(config, cloud=None, **kwargs)\nPreprocess datasets before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.train(config, accelerate, cloud=None, sweep=None, **kwargs)\nTrain or fine-tune a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file\nNone\n\n\nsweep\nOptional[str]\nPath to YAML config for sweeping hyperparameters.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config 
options.\n{}" + }, + { + "objectID": "docs/api/cli.main.html#functions", + "href": "docs/api/cli.main.html#functions", + "title": "cli.main", + "section": "", + "text": "Name\nDescription\n\n\n\n\ncli\nAxolotl CLI - Train and fine-tune large language models\n\n\nevaluate\nEvaluate a model.\n\n\nfetch\nFetch example configs or other resources.\n\n\ninference\nRun inference with a trained model.\n\n\nmerge_lora\nMerge trained LoRA adapters into a base model.\n\n\nmerge_sharded_fsdp_weights\nMerge sharded FSDP model weights.\n\n\npreprocess\nPreprocess datasets before training.\n\n\ntrain\nTrain or fine-tune a model.\n\n\n\n\n\ncli.main.cli()\nAxolotl CLI - Train and fine-tune large language models\n\n\n\ncli.main.evaluate(config, accelerate, **kwargs)\nEvaluate a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.fetch(directory, dest)\nFetch example configs or other resources.\nAvailable directories:\n- examples: Example configuration files\n- deepspeed_configs: DeepSpeed configuration files\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndirectory\nstr\nOne of examples, deepspeed_configs.\nrequired\n\n\ndest\nOptional[str]\nOptional destination directory.\nrequired\n\n\n\n\n\n\n\ncli.main.inference(config, accelerate, gradio, **kwargs)\nRun inference with a trained model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\ngradio\nbool\nWhether to use Gradio browser interface or command line for inference.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config 
options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_lora(config, **kwargs)\nMerge trained LoRA adapters into a base model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_sharded_fsdp_weights(config, accelerate, **kwargs)\nMerge sharded FSDP model weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.preprocess(config, cloud=None, **kwargs)\nPreprocess datasets before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.train(config, accelerate, cloud=None, sweep=None, **kwargs)\nTrain or fine-tune a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file\nNone\n\n\nsweep\nOptional[str]\nPath to YAML config for sweeping hyperparameters.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}" + }, + { + "objectID": "docs/api/core.trainers.trl.html", + "href": "docs/api/core.trainers.trl.html", + "title": "core.trainers.trl", + "section": "", + "text": "core.trainers.trl\nModule for TRL PPO 
trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlCPOTrainer\nExtend the base CPOTrainer for axolotl helpers\n\n\nAxolotlKTOTrainer\nExtend the base KTOTrainer for axolotl helpers\n\n\nAxolotlORPOTrainer\nExtend the base ORPOTrainer for axolotl helpers\n\n\nAxolotlPRMTrainer\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\nAxolotlRewardTrainer\nExtend the base RewardTrainer for axolotl helpers\n\n\nTRLPPOTrainer\nWrapper for TRL PPO trainer to handle customizations\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer()\nExtend the base CPOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlKTOTrainer()\nExtend the base KTOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer()\nExtend the base ORPOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlPRMTrainer()\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlRewardTrainer()\nExtend the base RewardTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.TRLPPOTrainer()\nWrapper for TRL PPO trainer to handle customizations" + }, + { + "objectID": "docs/api/core.trainers.trl.html#classes", + "href": "docs/api/core.trainers.trl.html#classes", + "title": "core.trainers.trl", + "section": "", + "text": "Name\nDescription\n\n\n\n\nAxolotlCPOTrainer\nExtend the base CPOTrainer for axolotl helpers\n\n\nAxolotlKTOTrainer\nExtend the base KTOTrainer for axolotl helpers\n\n\nAxolotlORPOTrainer\nExtend the base ORPOTrainer for axolotl helpers\n\n\nAxolotlPRMTrainer\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\nAxolotlRewardTrainer\nExtend the base RewardTrainer for axolotl helpers\n\n\nTRLPPOTrainer\nWrapper for TRL PPO trainer to handle customizations\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer()\nExtend the base CPOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlKTOTrainer()\nExtend the base KTOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer()\nExtend the base ORPOTrainer for axolotl 
helpers\n\n\n\ncore.trainers.trl.AxolotlPRMTrainer()\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlRewardTrainer()\nExtend the base RewardTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.TRLPPOTrainer()\nWrapper for TRL PPO trainer to handle customizations" + }, + { + "objectID": "docs/api/prompt_strategies.dpo.passthrough.html", + "href": "docs/api/prompt_strategies.dpo.passthrough.html", + "title": "prompt_strategies.dpo.passthrough", + "section": "", + "text": "prompt_strategies.dpo.passthrough\nprompt_strategies.dpo.passthrough\nDPO prompt strategies passthrough/zero-processing strategy" + }, + { + "objectID": "docs/api/core.chat.format.llama3x.html", + "href": "docs/api/core.chat.format.llama3x.html", + "title": "core.chat.format.llama3x", + "section": "", + "text": "core.chat.format.llama3x\ncore.chat.format.llama3x\nLlama 3.x chat formatting functions for MessageContents" + }, + { + "objectID": "docs/api/core.datasets.transforms.chat_builder.html", + "href": "docs/api/core.datasets.transforms.chat_builder.html", + "title": "core.datasets.transforms.chat_builder", + "section": "", + "text": "core.datasets.transforms.chat_builder\nThis module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat.\n\n\n\n\n\nName\nDescription\n\n\n\n\nchat_message_transform_builder\nBuilds a transform that takes a row from the dataset and converts it to a Chat\n\n\n\n\n\ncore.datasets.transforms.chat_builder.chat_message_transform_builder(\n train_on_inputs=False,\n conversations_field='conversations',\n message_field_role=['role', 'from'],\n message_field_content=['value', 'text', 'content'],\n message_field_training=['train', 'weight'],\n)\nBuilds a transform that takes a row from the dataset and converts it to a Chat\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrain_on_inputs\nbool\nIf True, the transform will train on the inputs. 
If False, the transform will train on the targets. Defaults to False.\nFalse\n\n\nconversations_field\nstr\nThe field name of the conversations. Defaults to “conversations”.\n'conversations'\n\n\nmessage_field_role\nstr | list[str]\nThe field name of the role. Defaults to “role”.\n['role', 'from']\n\n\nmessage_field_content\nstr | list[str]\nThe field name of the message content. Defaults to “content”.\n['value', 'text', 'content']\n\n\nmessage_field_training\nstr | list[str]\nThe field name of the train/weight. Defaults to “weight”.\n['train', 'weight']\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nCallable\n\nA function that takes a list of conversations and returns a list of messages." + }, + { + "objectID": "docs/api/core.datasets.transforms.chat_builder.html#functions", + "href": "docs/api/core.datasets.transforms.chat_builder.html#functions", + "title": "core.datasets.transforms.chat_builder", + "section": "", + "text": "Name\nDescription\n\n\n\n\nchat_message_transform_builder\nBuilds a transform that takes a row from the dataset and converts it to a Chat\n\n\n\n\n\ncore.datasets.transforms.chat_builder.chat_message_transform_builder(\n train_on_inputs=False,\n conversations_field='conversations',\n message_field_role=['role', 'from'],\n message_field_content=['value', 'text', 'content'],\n message_field_training=['train', 'weight'],\n)\nBuilds a transform that takes a row from the dataset and converts it to a Chat\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrain_on_inputs\nbool\nIf True, the transform will train on the inputs. If False, the transform will train on the targets. Defaults to False.\nFalse\n\n\nconversations_field\nstr\nThe field name of the conversations. Defaults to “conversations”.\n'conversations'\n\n\nmessage_field_role\nstr | list[str]\nThe field name of the role. Defaults to “role”.\n['role', 'from']\n\n\nmessage_field_content\nstr | list[str]\nThe field name of the message content. 
Defaults to “content”.\n['value', 'text', 'content']\n\n\nmessage_field_training\nstr | list[str]\nThe field name of the train/weight. Defaults to “weight”.\n['train', 'weight']\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nCallable\n\nA function that takes a list of conversations and returns a list of messages." + }, + { + "objectID": "docs/api/prompt_strategies.kto.user_defined.html", + "href": "docs/api/prompt_strategies.kto.user_defined.html", + "title": "prompt_strategies.kto.user_defined", + "section": "", + "text": "prompt_strategies.kto.user_defined\nprompt_strategies.kto.user_defined\nUser-defined KTO strategies" + }, + { + "objectID": "docs/api/utils.collators.mamba.html", + "href": "docs/api/utils.collators.mamba.html", + "title": "utils.collators.mamba", + "section": "", + "text": "utils.collators.mamba\ncollators for Mamba\n\n\n\n\n\nName\nDescription\n\n\n\n\nMambaDataCollator\nCollator for State Space Models (Mamba)\n\n\n\n\n\nutils.collators.mamba.MambaDataCollator(self, tokenizer)\nCollator for State Space Models (Mamba)" + }, + { + "objectID": "docs/api/utils.collators.mamba.html#classes", + "href": "docs/api/utils.collators.mamba.html#classes", + "title": "utils.collators.mamba", + "section": "", + "text": "Name\nDescription\n\n\n\n\nMambaDataCollator\nCollator for State Space Models (Mamba)\n\n\n\n\n\nutils.collators.mamba.MambaDataCollator(self, tokenizer)\nCollator for State Space Models (Mamba)" + }, + { + "objectID": "docs/api/integrations.base.html", + "href": "docs/api/integrations.base.html", + "title": "integrations.base", + "section": "", + "text": "integrations.base\nBase class for all plugins.\nA plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl.\nPlugins can be used to integrate third-party models, modify the training process, or add new features.\nTo create a new plugin, you need to inherit from the BasePlugin class and implement the required 
methods.\n\n\n\n\n\nName\nDescription\n\n\n\n\nBaseOptimizerFactory\nBase class for factories to create custom optimizers\n\n\nBasePlugin\nBase class for all plugins. Defines the interface for plugin methods.\n\n\nPluginManager\nThe PluginManager class is responsible for loading and managing plugins.\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory()\nBase class for factories to create custom optimizers\n\n\n\nintegrations.base.BasePlugin(self)\nBase class for all plugins. Defines the interface for plugin methods.\nAttributes:\nNone\nMethods:\nregister(cfg): Registers the plugin with the given configuration.\npre_model_load(cfg): Performs actions before the model is loaded.\npost_model_load(cfg, model): Performs actions after the model is loaded.\npre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.\npost_lora_load(cfg, model): Performs actions after LoRA weights are loaded.\ncreate_optimizer(cfg, trainer): Creates and returns an optimizer for training.\ncreate_lr_scheduler(cfg, trainer, optimizer): Creates and returns a learning rate scheduler.\nadd_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training.\nadd_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after training.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nAdds callbacks to the trainer after creating the trainer.\n\n\nadd_callbacks_pre_trainer\nsetup callbacks before creating the trainer.\n\n\ncreate_lr_scheduler\nCreates and returns a learning rate scheduler.\n\n\ncreate_optimizer\nCreates and returns an optimizer for training.\n\n\nget_input_args\nReturns a pydantic model for the plugin’s input arguments.\n\n\nget_trainer_cls\nReturns a custom class for the trainer.\n\n\npost_lora_load\nPerforms actions after LoRA weights are loaded.\n\n\npost_model_load\nPerforms actions after the model is loaded.\n\n\npost_train\nPerforms actions after training is complete.\n\n\npost_train_unload\nPerforms actions after 
training is complete and the model is unloaded.\n\n\npre_lora_load\nPerforms actions before LoRA weights are loaded.\n\n\npre_model_load\nPerforms actions before the model is loaded.\n\n\nregister\nRegisters the plugin with the given configuration.\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_post_trainer(cfg, trainer)\nAdds callbacks to the trainer after creating the trainer.\nThis is useful for callbacks that require access to the model or trainer.\nParameters:\ncfg (dict): The configuration for the plugin.\ntrainer (object): The trainer object for training.\nReturns:\nList[callable]: A list of callback functions to be added\n\n\n\nintegrations.base.BasePlugin.add_callbacks_pre_trainer(cfg, model)\nsetup callbacks before creating the trainer.\nParameters:\ncfg (dict): The configuration for the plugin.\nmodel (object): The loaded model.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs\n\n\n\nintegrations.base.BasePlugin.create_lr_scheduler(cfg, trainer, optimizer)\nCreates and returns a learning rate scheduler.\nParameters:\ncfg (dict): The configuration for the plugin.\ntrainer (object): The trainer object for training.\noptimizer (object): The optimizer for training.\nReturns:\nobject: The created learning rate scheduler.\n\n\n\nintegrations.base.BasePlugin.create_optimizer(cfg, trainer)\nCreates and returns an optimizer for training.\nParameters:\ncfg (dict): The configuration for the plugin.\ntrainer (object): The trainer object for training.\nReturns:\nobject: The created optimizer.\n\n\n\nintegrations.base.BasePlugin.get_input_args()\nReturns a pydantic model for the plugin’s input arguments.\n\n\n\nintegrations.base.BasePlugin.get_trainer_cls(cfg)\nReturns a custom class for the trainer.\nParameters:\ncfg (dict): The global axolotl configuration.\nReturns:\nclass: The class for the trainer.\n\n\n\nintegrations.base.BasePlugin.post_lora_load(cfg, model)\nPerforms actions after LoRA weights are 
loaded.\nParameters:\ncfg (dict): The configuration for the plugin.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.BasePlugin.post_model_load(cfg, model)\nPerforms actions after the model is loaded.\nParameters:\ncfg (dict): The configuration for the plugin.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.BasePlugin.post_train(cfg, model)\nPerforms actions after training is complete.\nParameters:\ncfg (dict): The axolotl configuration\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.BasePlugin.post_train_unload(cfg)\nPerforms actions after training is complete and the model is unloaded.\nParameters:\ncfg (dict): The configuration for the plugin.\nReturns:\nNone\n\n\n\nintegrations.base.BasePlugin.pre_lora_load(cfg, model)\nPerforms actions before LoRA weights are loaded.\nParameters:\ncfg (dict): The configuration for the plugin.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.BasePlugin.pre_model_load(cfg)\nPerforms actions before the model is loaded.\nParameters:\ncfg (dict): The configuration for the plugin.\nReturns:\nNone\n\n\n\nintegrations.base.BasePlugin.register(cfg)\nRegisters the plugin with the given configuration.\nParameters:\ncfg (dict): The configuration for the plugin.\nReturns:\nNone\n\n\n\n\n\nintegrations.base.PluginManager()\nThe PluginManager class is responsible for loading and managing plugins.\nIt should be a singleton so it can be accessed from anywhere in the codebase.\nAttributes:\nplugins (ListBasePlugin): A list of loaded plugins.\nMethods:\nget_instance(): Static method to get the singleton instance of PluginManager.\nregister(plugin_name: str): Registers a new plugin by its name.\npre_model_load(cfg): Calls the pre_model_load method of all registered plugins.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nCalls the add_callbacks_post_trainer method of all registered 
plugins.\n\n\nadd_callbacks_pre_trainer\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\ncreate_lr_scheduler\nCalls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.\n\n\ncreate_optimizer\nCalls the create_optimizer method of all registered plugins and returns the first non-None optimizer.\n\n\nget_input_args\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\n\n\nget_instance\nReturns the singleton instance of PluginManager.\n\n\nget_trainer_cls\nCalls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.\n\n\npost_lora_load\nCalls the post_lora_load method of all registered plugins.\n\n\npost_model_load\nCalls the post_model_load method of all registered plugins.\n\n\npost_train_unload\nCalls the post_train_unload method of all registered plugins.\n\n\npre_lora_load\nCalls the pre_lora_load method of all registered plugins.\n\n\npre_model_load\nCalls the pre_model_load method of all registered plugins.\n\n\nregister\nRegisters a new plugin by its name.\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_post_trainer(cfg, trainer)\nCalls the add_callbacks_post_trainer method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs.\n\n\n\nintegrations.base.PluginManager.add_callbacks_pre_trainer(cfg, model)\nCalls the add_callbacks_pre_trainer method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs.\n\n\n\nintegrations.base.PluginManager.create_lr_scheduler(cfg, trainer, optimizer)\nCalls the create_lr_scheduler method of all registered plugins and returns the first non-None 
scheduler.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\noptimizer (object): The optimizer for training.\nReturns:\nobject: The created learning rate scheduler, or None if none was found.\n\n\n\nintegrations.base.PluginManager.create_optimizer(cfg, trainer)\nCalls the create_optimizer method of all registered plugins and returns the first non-None optimizer.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\nReturns:\nobject: The created optimizer, or None if none was found.\n\n\n\nintegrations.base.PluginManager.get_input_args()\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\nReturns:\nlist[str]: A list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\nintegrations.base.PluginManager.get_instance()\nReturns the singleton instance of PluginManager.\nIf the instance doesn’t exist, it creates a new one.\n\n\n\nintegrations.base.PluginManager.get_trainer_cls(cfg)\nCalls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nobject: The trainer class, or None if none was found.\n\n\n\nintegrations.base.PluginManager.post_lora_load(cfg, model)\nCalls the post_lora_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_model_load(cfg, model)\nCalls the post_model_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_train_unload(cfg)\nCalls the post_train_unload method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded 
model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.pre_lora_load(cfg, model)\nCalls the pre_lora_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.pre_model_load(cfg)\nCalls the pre_model_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.register(plugin_name)\nRegisters a new plugin by its name.\nParameters:\nplugin_name (str): The name of the plugin to be registered.\nReturns:\nNone\nRaises:\nImportError: If the plugin module cannot be imported.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_plugin\nLoads a plugin based on the given plugin name.\n\n\n\n\n\nintegrations.base.load_plugin(plugin_name)\nLoads a plugin based on the given plugin name.\nThe plugin name should be in the format “module_name.class_name”.\nThis function splits the plugin name into module and class, imports the module,\nretrieves the class from the module, and creates an instance of the class.\nParameters:\nplugin_name (str): The name of the plugin to be loaded. The name should be in the format “module_name.class_name”.\nReturns:\nBasePlugin: An instance of the loaded plugin.\nRaises:\nImportError: If the plugin module cannot be imported." + }, + { + "objectID": "docs/api/integrations.base.html#classes", + "href": "docs/api/integrations.base.html#classes", + "title": "integrations.base", + "section": "", + "text": "Name\nDescription\n\n\n\n\nBaseOptimizerFactory\nBase class for factories to create custom optimizers\n\n\nBasePlugin\nBase class for all plugins. 
Defines the interface for plugin methods.\n\n\nPluginManager\nThe PluginManager class is responsible for loading and managing plugins.\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory()\nBase class for factories to create custom optimizers\n\n\n\nintegrations.base.BasePlugin(self)\nBase class for all plugins. Defines the interface for plugin methods.\nAttributes:\nNone\nMethods:\nregister(cfg): Registers the plugin with the given configuration.\npre_model_load(cfg): Performs actions before the model is loaded.\npost_model_load(cfg, model): Performs actions after the model is loaded.\npre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.\npost_lora_load(cfg, model): Performs actions after LoRA weights are loaded.\ncreate_optimizer(cfg, trainer): Creates and returns an optimizer for training.\ncreate_lr_scheduler(cfg, trainer, optimizer): Creates and returns a learning rate scheduler.\nadd_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training.\nadd_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after training.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nAdds callbacks to the trainer after creating the trainer.\n\n\nadd_callbacks_pre_trainer\nsetup callbacks before creating the trainer.\n\n\ncreate_lr_scheduler\nCreates and returns a learning rate scheduler.\n\n\ncreate_optimizer\nCreates and returns an optimizer for training.\n\n\nget_input_args\nReturns a pydantic model for the plugin’s input arguments.\n\n\nget_trainer_cls\nReturns a custom class for the trainer.\n\n\npost_lora_load\nPerforms actions after LoRA weights are loaded.\n\n\npost_model_load\nPerforms actions after the model is loaded.\n\n\npost_train\nPerforms actions after training is complete.\n\n\npost_train_unload\nPerforms actions after training is complete and the model is unloaded.\n\n\npre_lora_load\nPerforms actions before LoRA weights are loaded.\n\n\npre_model_load\nPerforms actions before the model is 
loaded.\n\n\nregister\nRegisters the plugin with the given configuration.\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_post_trainer(cfg, trainer)\nAdds callbacks to the trainer after creating the trainer.\nThis is useful for callbacks that require access to the model or trainer.\nParameters:\ncfg (dict): The configuration for the plugin.\ntrainer (object): The trainer object for training.\nReturns:\nList[callable]: A list of callback functions to be added\n\n\n\nintegrations.base.BasePlugin.add_callbacks_pre_trainer(cfg, model)\nsetup callbacks before creating the trainer.\nParameters:\ncfg (dict): The configuration for the plugin.\nmodel (object): The loaded model.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs\n\n\n\nintegrations.base.BasePlugin.create_lr_scheduler(cfg, trainer, optimizer)\nCreates and returns a learning rate scheduler.\nParameters:\ncfg (dict): The configuration for the plugin.\ntrainer (object): The trainer object for training.\noptimizer (object): The optimizer for training.\nReturns:\nobject: The created learning rate scheduler.\n\n\n\nintegrations.base.BasePlugin.create_optimizer(cfg, trainer)\nCreates and returns an optimizer for training.\nParameters:\ncfg (dict): The configuration for the plugin.\ntrainer (object): The trainer object for training.\nReturns:\nobject: The created optimizer.\n\n\n\nintegrations.base.BasePlugin.get_input_args()\nReturns a pydantic model for the plugin’s input arguments.\n\n\n\nintegrations.base.BasePlugin.get_trainer_cls(cfg)\nReturns a custom class for the trainer.\nParameters:\ncfg (dict): The global axolotl configuration.\nReturns:\nclass: The class for the trainer.\n\n\n\nintegrations.base.BasePlugin.post_lora_load(cfg, model)\nPerforms actions after LoRA weights are loaded.\nParameters:\ncfg (dict): The configuration for the plugin.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.BasePlugin.post_model_load(cfg, model)\nPerforms 
actions after the model is loaded.\nParameters:\ncfg (dict): The configuration for the plugin.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.BasePlugin.post_train(cfg, model)\nPerforms actions after training is complete.\nParameters:\ncfg (dict): The axolotl configuration\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.BasePlugin.post_train_unload(cfg)\nPerforms actions after training is complete and the model is unloaded.\nParameters:\ncfg (dict): The configuration for the plugin.\nReturns:\nNone\n\n\n\nintegrations.base.BasePlugin.pre_lora_load(cfg, model)\nPerforms actions before LoRA weights are loaded.\nParameters:\ncfg (dict): The configuration for the plugin.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.BasePlugin.pre_model_load(cfg)\nPerforms actions before the model is loaded.\nParameters:\ncfg (dict): The configuration for the plugin.\nReturns:\nNone\n\n\n\nintegrations.base.BasePlugin.register(cfg)\nRegisters the plugin with the given configuration.\nParameters:\ncfg (dict): The configuration for the plugin.\nReturns:\nNone\n\n\n\n\n\nintegrations.base.PluginManager()\nThe PluginManager class is responsible for loading and managing plugins.\nIt should be a singleton so it can be accessed from anywhere in the codebase.\nAttributes:\nplugins (ListBasePlugin): A list of loaded plugins.\nMethods:\nget_instance(): Static method to get the singleton instance of PluginManager.\nregister(plugin_name: str): Registers a new plugin by its name.\npre_model_load(cfg): Calls the pre_model_load method of all registered plugins.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nCalls the add_callbacks_post_trainer method of all registered plugins.\n\n\nadd_callbacks_pre_trainer\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\ncreate_lr_scheduler\nCalls the create_lr_scheduler method of all registered plugins and returns the first non-None 
scheduler.\n\n\ncreate_optimizer\nCalls the create_optimizer method of all registered plugins and returns the first non-None optimizer.\n\n\nget_input_args\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\n\n\nget_instance\nReturns the singleton instance of PluginManager.\n\n\nget_trainer_cls\nCalls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.\n\n\npost_lora_load\nCalls the post_lora_load method of all registered plugins.\n\n\npost_model_load\nCalls the post_model_load method of all registered plugins.\n\n\npost_train_unload\nCalls the post_train_unload method of all registered plugins.\n\n\npre_lora_load\nCalls the pre_lora_load method of all registered plugins.\n\n\npre_model_load\nCalls the pre_model_load method of all registered plugins.\n\n\nregister\nRegisters a new plugin by its name.\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_post_trainer(cfg, trainer)\nCalls the add_callbacks_post_trainer method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs.\n\n\n\nintegrations.base.PluginManager.add_callbacks_pre_trainer(cfg, model)\nCalls the add_callbacks_pre_trainer method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs.\n\n\n\nintegrations.base.PluginManager.create_lr_scheduler(cfg, trainer, optimizer)\nCalls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\noptimizer (object): The optimizer for training.\nReturns:\nobject: The created learning rate scheduler, or None if 
none was found.\n\n\n\nintegrations.base.PluginManager.create_optimizer(cfg, trainer)\nCalls the create_optimizer method of all registered plugins and returns the first non-None optimizer.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\nReturns:\nobject: The created optimizer, or None if none was found.\n\n\n\nintegrations.base.PluginManager.get_input_args()\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\nReturns:\nlist[str]: A list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\nintegrations.base.PluginManager.get_instance()\nReturns the singleton instance of PluginManager.\nIf the instance doesn’t exist, it creates a new one.\n\n\n\nintegrations.base.PluginManager.get_trainer_cls(cfg)\nCalls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nobject: The trainer class, or None if none was found.\n\n\n\nintegrations.base.PluginManager.post_lora_load(cfg, model)\nCalls the post_lora_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_model_load(cfg, model)\nCalls the post_model_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_train_unload(cfg)\nCalls the post_train_unload method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.pre_lora_load(cfg, model)\nCalls the pre_lora_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded 
model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.pre_model_load(cfg)\nCalls the pre_model_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.register(plugin_name)\nRegisters a new plugin by its name.\nParameters:\nplugin_name (str): The name of the plugin to be registered.\nReturns:\nNone\nRaises:\nImportError: If the plugin module cannot be imported." + }, + { + "objectID": "docs/api/integrations.base.html#functions", + "href": "docs/api/integrations.base.html#functions", + "title": "integrations.base", + "section": "", + "text": "Name\nDescription\n\n\n\n\nload_plugin\nLoads a plugin based on the given plugin name.\n\n\n\n\n\nintegrations.base.load_plugin(plugin_name)\nLoads a plugin based on the given plugin name.\nThe plugin name should be in the format “module_name.class_name”.\nThis function splits the plugin name into module and class, imports the module,\nretrieves the class from the module, and creates an instance of the class.\nParameters:\nplugin_name (str): The name of the plugin to be loaded. The name should be in the format “module_name.class_name”.\nReturns:\nBasePlugin: An instance of the loaded plugin.\nRaises:\nImportError: If the plugin module cannot be imported." 
+ }, + { + "objectID": "docs/api/utils.bench.html", + "href": "docs/api/utils.bench.html", + "title": "utils.bench", + "section": "", + "text": "utils.bench\nBenchmarking and measurement utilities\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_cuda_device\nwraps a function and returns the default value instead of running the\n\n\n\n\n\nutils.bench.check_cuda_device(default_value)\nwraps a function and returns the default value instead of running the\nwrapped function if cuda isn’t available or the device is auto\n:param default_value:\n:return:" + }, + { + "objectID": "docs/api/utils.bench.html#functions", + "href": "docs/api/utils.bench.html#functions", + "title": "utils.bench", + "section": "", + "text": "Name\nDescription\n\n\n\n\ncheck_cuda_device\nwraps a function and returns the default value instead of running the\n\n\n\n\n\nutils.bench.check_cuda_device(default_value)\nwraps a function and returns the default value instead of running the\nwrapped function if cuda isn’t available or the device is auto\n:param default_value:\n:return:" + }, + { + "objectID": "docs/api/kernels.swiglu.html", + "href": "docs/api/kernels.swiglu.html", + "title": "kernels.swiglu", + "section": "", + "text": "kernels.swiglu\nModule for definition of SwiGLU Triton kernels.\nSee “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).\nCredit to unsloth (https://unsloth.ai/) for inspiration for this implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nswiglu_backward\nSwiGLU backward pass using in-place operations.\n\n\nswiglu_forward\nSwiGLU forward pass. 
Computes SwiGLU activation: x * sigmoid(x) * up, where\n\n\n\n\n\nkernels.swiglu.swiglu_backward(grad_output, gate, up)\nSwiGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - Forward pass output (h) - Gradient with respect to gate (df) - Gradient with respect to up-projection (de)\n\n\n\n\n\n\n\nkernels.swiglu.swiglu_forward(gate, up)\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\nx is the gate tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor of shape [batch, seq_len, hidden_dim]." + }, + { + "objectID": "docs/api/kernels.swiglu.html#functions", + "href": "docs/api/kernels.swiglu.html#functions", + "title": "kernels.swiglu", + "section": "", + "text": "Name\nDescription\n\n\n\n\nswiglu_backward\nSwiGLU backward pass using in-place operations.\n\n\nswiglu_forward\nSwiGLU forward pass. 
Computes SwiGLU activation: x * sigmoid(x) * up, where\n\n\n\n\n\nkernels.swiglu.swiglu_backward(grad_output, gate, up)\nSwiGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - Forward pass output (h) - Gradient with respect to gate (df) - Gradient with respect to up-projection (de)\n\n\n\n\n\n\n\nkernels.swiglu.swiglu_forward(gate, up)\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\nx is the gate tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor of shape [batch, seq_len, hidden_dim]." 
+ }, + { + "objectID": "docs/api/core.chat.format.shared.html", + "href": "docs/api/core.chat.format.shared.html", + "title": "core.chat.format.shared", + "section": "", + "text": "core.chat.format.shared\ncore.chat.format.shared\nshared functions for format transforms" + }, + { + "objectID": "docs/api/integrations.cut_cross_entropy.args.html", + "href": "docs/api/integrations.cut_cross_entropy.args.html", + "title": "integrations.cut_cross_entropy.args", + "section": "", + "text": "integrations.cut_cross_entropy.args\nModule for handling Cut Cross Entropy input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nCutCrossEntropyArgs\nInput args for Cut Cross Entropy.\n\n\n\n\n\nintegrations.cut_cross_entropy.args.CutCrossEntropyArgs()\nInput args for Cut Cross Entropy." + }, + { + "objectID": "docs/api/integrations.cut_cross_entropy.args.html#classes", + "href": "docs/api/integrations.cut_cross_entropy.args.html#classes", + "title": "integrations.cut_cross_entropy.args", + "section": "", + "text": "Name\nDescription\n\n\n\n\nCutCrossEntropyArgs\nInput args for Cut Cross Entropy.\n\n\n\n\n\nintegrations.cut_cross_entropy.args.CutCrossEntropyArgs()\nInput args for Cut Cross Entropy." 
+ }, + { + "objectID": "docs/api/core.datasets.chat.html", + "href": "docs/api/core.datasets.chat.html", + "title": "core.datasets.chat", + "section": "", + "text": "core.datasets.chat\nchat dataset module\n\n\n\n\n\nName\nDescription\n\n\n\n\nTokenizedChatDataset\nTokenized chat dataset\n\n\n\n\n\ncore.datasets.chat.TokenizedChatDataset(\n self,\n data,\n model_transform,\n *args,\n message_transform=None,\n formatter=None,\n process_count=None,\n keep_in_memory=False,\n **kwargs,\n)\nTokenized chat dataset" + }, + { + "objectID": "docs/api/core.datasets.chat.html#classes", + "href": "docs/api/core.datasets.chat.html#classes", + "title": "core.datasets.chat", + "section": "", + "text": "Name\nDescription\n\n\n\n\nTokenizedChatDataset\nTokenized chat dataset\n\n\n\n\n\ncore.datasets.chat.TokenizedChatDataset(\n self,\n data,\n model_transform,\n *args,\n message_transform=None,\n formatter=None,\n process_count=None,\n keep_in_memory=False,\n **kwargs,\n)\nTokenized chat dataset" + }, + { + "objectID": "docs/api/utils.callbacks.lisa.html", + "href": "docs/api/utils.callbacks.lisa.html", + "title": "utils.callbacks.lisa", + "section": "", + "text": "utils.callbacks.lisa\nutils.callbacks.lisa\nmodule for LISA\nAdapted from https://github.com/OptimalScale/LMFlow/pull/701 for HF transformers & Axolotl\nArxiv: https://arxiv.org/abs/2403.17919\nLicense: Apache 2.0" + }, + { + "objectID": "docs/api/integrations.grokfast.optimizer.html", + "href": "docs/api/integrations.grokfast.optimizer.html", + "title": "integrations.grokfast.optimizer", + "section": "", + "text": "integrations.grokfast.optimizer\nintegrations.grokfast.optimizer" + }, + { + "objectID": "docs/api/prompt_strategies.alpaca_chat.html", + "href": "docs/api/prompt_strategies.alpaca_chat.html", + "title": "prompt_strategies.alpaca_chat", + "section": "", + "text": "prompt_strategies.alpaca_chat\nModule for Alpaca prompt strategy classes\n\n\n\n\n\nName\nDescription\n\n\n\n\nAlpacaChatPrompter\nAlpaca Chat 
Prompter extending the system prompt to for chat-instruct answers\n\n\nAlpacaConcisePrompter\nAlpaca Prompter extending the system prompt to ask for concise chat-instruct answers\n\n\nAlpacaQAPromptTokenizingStrategy\nTokenizing strategy for AlpacaQA\n\n\nCamelAIPromptTokenizingStrategy\nTokenizing strategy for CamelAI datasets\n\n\nNoSystemPrompter\nNull Prompter with no system prompts\n\n\n\n\n\nprompt_strategies.alpaca_chat.AlpacaChatPrompter(self)\nAlpaca Chat Prompter extending the system prompt to for chat-instruct answers\n\n\n\nprompt_strategies.alpaca_chat.AlpacaConcisePrompter(\n self,\n prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Prompter extending the system prompt to ask for concise chat-instruct answers\n\n\n\nprompt_strategies.alpaca_chat.AlpacaQAPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for AlpacaQA\n\n\n\nprompt_strategies.alpaca_chat.CamelAIPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for CamelAI datasets\n\n\n\nprompt_strategies.alpaca_chat.NoSystemPrompter(self)\nNull Prompter with no system prompts" + }, + { + "objectID": "docs/api/prompt_strategies.alpaca_chat.html#classes", + "href": "docs/api/prompt_strategies.alpaca_chat.html#classes", + "title": "prompt_strategies.alpaca_chat", + "section": "", + "text": "Name\nDescription\n\n\n\n\nAlpacaChatPrompter\nAlpaca Chat Prompter extending the system prompt to for chat-instruct answers\n\n\nAlpacaConcisePrompter\nAlpaca Prompter extending the system prompt to ask for concise chat-instruct answers\n\n\nAlpacaQAPromptTokenizingStrategy\nTokenizing strategy for AlpacaQA\n\n\nCamelAIPromptTokenizingStrategy\nTokenizing strategy for CamelAI datasets\n\n\nNoSystemPrompter\nNull Prompter with no system prompts\n\n\n\n\n\nprompt_strategies.alpaca_chat.AlpacaChatPrompter(self)\nAlpaca Chat Prompter extending the system prompt to 
for chat-instruct answers\n\n\n\nprompt_strategies.alpaca_chat.AlpacaConcisePrompter(\n self,\n prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Prompter extending the system prompt to ask for concise chat-instruct answers\n\n\n\nprompt_strategies.alpaca_chat.AlpacaQAPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for AlpacaQA\n\n\n\nprompt_strategies.alpaca_chat.CamelAIPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for CamelAI datasets\n\n\n\nprompt_strategies.alpaca_chat.NoSystemPrompter(self)\nNull Prompter with no system prompts" + }, + { + "objectID": "docs/api/prompt_strategies.alpaca_instruct.html", + "href": "docs/api/prompt_strategies.alpaca_instruct.html", + "title": "prompt_strategies.alpaca_instruct", + "section": "", + "text": "prompt_strategies.alpaca_instruct\nprompt_strategies.alpaca_instruct\nModule loading the AlpacaInstructPromptTokenizingStrategy class" + }, + { + "objectID": "docs/api/prompt_strategies.kto.chatml.html", + "href": "docs/api/prompt_strategies.kto.chatml.html", + "title": "prompt_strategies.kto.chatml", + "section": "", + "text": "prompt_strategies.kto.chatml\nKTO strategies for chatml\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.chatml.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto" + }, + { + "objectID": "docs/api/prompt_strategies.kto.chatml.html#functions", + "href": 
"docs/api/prompt_strategies.kto.chatml.html#functions", + "title": "prompt_strategies.kto.chatml", + "section": "", + "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.chatml.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto" + }, + { + "objectID": "docs/api/utils.schemas.integrations.html", + "href": "docs/api/utils.schemas.integrations.html", + "title": "utils.schemas.integrations", + "section": "", + "text": "utils.schemas.integrations\nPydantic models for Axolotl integrations\n\n\n\n\n\nName\nDescription\n\n\n\n\nCometConfig\nComet configuration subset\n\n\nGradioConfig\nGradio configuration subset\n\n\nLISAConfig\nLISA configuration subset\n\n\nMLFlowConfig\nMLFlow configuration subset\n\n\nRayConfig\nRay launcher configuration subset\n\n\nWandbConfig\nWandb configuration subset\n\n\n\n\n\nutils.schemas.integrations.CometConfig()\nComet configuration subset\n\n\n\nutils.schemas.integrations.GradioConfig()\nGradio configuration subset\n\n\n\nutils.schemas.integrations.LISAConfig()\nLISA configuration subset\n\n\n\nutils.schemas.integrations.MLFlowConfig()\nMLFlow configuration subset\n\n\n\nutils.schemas.integrations.RayConfig()\nRay launcher configuration subset\n\n\n\nutils.schemas.integrations.WandbConfig()\nWandb configuration subset" + }, + { + "objectID": "docs/api/utils.schemas.integrations.html#classes", + "href": "docs/api/utils.schemas.integrations.html#classes", + "title": "utils.schemas.integrations", + "section": "", + "text": "Name\nDescription\n\n\n\n\nCometConfig\nComet configuration 
subset\n\n\nGradioConfig\nGradio configuration subset\n\n\nLISAConfig\nLISA configuration subset\n\n\nMLFlowConfig\nMLFlow configuration subset\n\n\nRayConfig\nRay launcher configuration subset\n\n\nWandbConfig\nWandb configuration subset\n\n\n\n\n\nutils.schemas.integrations.CometConfig()\nComet configuration subset\n\n\n\nutils.schemas.integrations.GradioConfig()\nGradio configuration subset\n\n\n\nutils.schemas.integrations.LISAConfig()\nLISA configuration subset\n\n\n\nutils.schemas.integrations.MLFlowConfig()\nMLFlow configuration subset\n\n\n\nutils.schemas.integrations.RayConfig()\nRay launcher configuration subset\n\n\n\nutils.schemas.integrations.WandbConfig()\nWandb configuration subset" + }, + { + "objectID": "docs/api/utils.schemas.trl.html", + "href": "docs/api/utils.schemas.trl.html", + "title": "utils.schemas.trl", + "section": "", + "text": "utils.schemas.trl\nPydantic models for TRL trainer configuration\n\n\n\n\n\nName\nDescription\n\n\n\n\nTRLConfig\nInput args for TRL.\n\n\n\n\n\nutils.schemas.trl.TRLConfig()\nInput args for TRL." + }, + { + "objectID": "docs/api/utils.schemas.trl.html#classes", + "href": "docs/api/utils.schemas.trl.html#classes", + "title": "utils.schemas.trl", + "section": "", + "text": "Name\nDescription\n\n\n\n\nTRLConfig\nInput args for TRL.\n\n\n\n\n\nutils.schemas.trl.TRLConfig()\nInput args for TRL." 
+ }, + { + "objectID": "docs/api/prompt_tokenizers.html", + "href": "docs/api/prompt_tokenizers.html", + "title": "prompt_tokenizers", + "section": "", + "text": "prompt_tokenizers\nModule containing PromptTokenizingStrategy and Prompter classes\n\n\n\n\n\nName\nDescription\n\n\n\n\nAlpacaMultipleChoicePromptTokenizingStrategy\nTokenizing strategy for Alpaca Multiple Choice prompts.\n\n\nAlpacaPromptTokenizingStrategy\nTokenizing strategy for Alpaca prompts.\n\n\nAlpacaReflectionPTStrategy\nTokenizing strategy for Alpaca Reflection prompts.\n\n\nDatasetWrappingStrategy\nAbstract class for wrapping datasets for Chat Messages\n\n\nGPTeacherPromptTokenizingStrategy\nTokenizing strategy for GPTeacher prompts.\n\n\nInstructionPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nInvalidDataException\nException raised when the data is invalid\n\n\nJeopardyPromptTokenizingStrategy\nTokenizing strategy for Jeopardy prompts.\n\n\nNomicGPT4AllPromptTokenizingStrategy\nTokenizing strategy for NomicGPT4All prompts.\n\n\nOpenAssistantPromptTokenizingStrategy\nTokenizing strategy for OpenAssistant prompts.\n\n\nPromptTokenizingStrategy\nAbstract class for tokenizing strategies\n\n\nReflectionPromptTokenizingStrategy\nTokenizing strategy for Reflection prompts.\n\n\nSummarizeTLDRPromptTokenizingStrategy\nTokenizing strategy for SummarizeTLDR prompts.\n\n\n\n\n\nprompt_tokenizers.AlpacaMultipleChoicePromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for Alpaca Multiple Choice prompts.\n\n\n\nprompt_tokenizers.AlpacaPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for Alpaca prompts.\n\n\n\nprompt_tokenizers.AlpacaReflectionPTStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for Alpaca Reflection 
prompts.\n\n\n\nprompt_tokenizers.DatasetWrappingStrategy()\nAbstract class for wrapping datasets for Chat Messages\n\n\n\nprompt_tokenizers.GPTeacherPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for GPTeacher prompts.\n\n\n\nprompt_tokenizers.InstructionPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_tokenizers.InvalidDataException()\nException raised when the data is invalid\n\n\n\nprompt_tokenizers.JeopardyPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for Jeopardy prompts.\n\n\n\nprompt_tokenizers.NomicGPT4AllPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for NomicGPT4All prompts.\n\n\n\nprompt_tokenizers.OpenAssistantPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for OpenAssistant prompts.\n\n\n\nprompt_tokenizers.PromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nAbstract class for tokenizing strategies\n\n\n\nprompt_tokenizers.ReflectionPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for Reflection prompts.\n\n\n\nprompt_tokenizers.SummarizeTLDRPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for SummarizeTLDR prompts.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nparse_tokenized_to_result\nParses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result\n\n\ntokenize_prompt_default\nReturns the default values for the tokenize prompt 
function\n\n\n\n\n\nprompt_tokenizers.parse_tokenized_to_result(\n result,\n current_len,\n res,\n labels,\n pad_token_id=None,\n)\nParses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result\n\n\n\nprompt_tokenizers.tokenize_prompt_default()\nReturns the default values for the tokenize prompt function" + }, + { + "objectID": "docs/api/prompt_tokenizers.html#classes", + "href": "docs/api/prompt_tokenizers.html#classes", + "title": "prompt_tokenizers", + "section": "", + "text": "Name\nDescription\n\n\n\n\nAlpacaMultipleChoicePromptTokenizingStrategy\nTokenizing strategy for Alpaca Multiple Choice prompts.\n\n\nAlpacaPromptTokenizingStrategy\nTokenizing strategy for Alpaca prompts.\n\n\nAlpacaReflectionPTStrategy\nTokenizing strategy for Alpaca Reflection prompts.\n\n\nDatasetWrappingStrategy\nAbstract class for wrapping datasets for Chat Messages\n\n\nGPTeacherPromptTokenizingStrategy\nTokenizing strategy for GPTeacher prompts.\n\n\nInstructionPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nInvalidDataException\nException raised when the data is invalid\n\n\nJeopardyPromptTokenizingStrategy\nTokenizing strategy for Jeopardy prompts.\n\n\nNomicGPT4AllPromptTokenizingStrategy\nTokenizing strategy for NomicGPT4All prompts.\n\n\nOpenAssistantPromptTokenizingStrategy\nTokenizing strategy for OpenAssistant prompts.\n\n\nPromptTokenizingStrategy\nAbstract class for tokenizing strategies\n\n\nReflectionPromptTokenizingStrategy\nTokenizing strategy for Reflection prompts.\n\n\nSummarizeTLDRPromptTokenizingStrategy\nTokenizing strategy for SummarizeTLDR prompts.\n\n\n\n\n\nprompt_tokenizers.AlpacaMultipleChoicePromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for Alpaca Multiple Choice prompts.\n\n\n\nprompt_tokenizers.AlpacaPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n 
sequence_len=2048,\n)\nTokenizing strategy for Alpaca prompts.\n\n\n\nprompt_tokenizers.AlpacaReflectionPTStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for Alpaca Reflection prompts.\n\n\n\nprompt_tokenizers.DatasetWrappingStrategy()\nAbstract class for wrapping datasets for Chat Messages\n\n\n\nprompt_tokenizers.GPTeacherPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for GPTeacher prompts.\n\n\n\nprompt_tokenizers.InstructionPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_tokenizers.InvalidDataException()\nException raised when the data is invalid\n\n\n\nprompt_tokenizers.JeopardyPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for Jeopardy prompts.\n\n\n\nprompt_tokenizers.NomicGPT4AllPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for NomicGPT4All prompts.\n\n\n\nprompt_tokenizers.OpenAssistantPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for OpenAssistant prompts.\n\n\n\nprompt_tokenizers.PromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nAbstract class for tokenizing strategies\n\n\n\nprompt_tokenizers.ReflectionPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for Reflection prompts.\n\n\n\nprompt_tokenizers.SummarizeTLDRPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for SummarizeTLDR prompts." 
+ }, + { + "objectID": "docs/api/prompt_tokenizers.html#functions", + "href": "docs/api/prompt_tokenizers.html#functions", + "title": "prompt_tokenizers", + "section": "", + "text": "Name\nDescription\n\n\n\n\nparse_tokenized_to_result\nParses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result\n\n\ntokenize_prompt_default\nReturns the default values for the tokenize prompt function\n\n\n\n\n\nprompt_tokenizers.parse_tokenized_to_result(\n result,\n current_len,\n res,\n labels,\n pad_token_id=None,\n)\nParses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result\n\n\n\nprompt_tokenizers.tokenize_prompt_default()\nReturns the default values for the tokenize prompt function" + }, + { + "objectID": "docs/api/utils.data.sft.html", + "href": "docs/api/utils.data.sft.html", + "title": "utils.data.sft", + "section": "", + "text": "utils.data.sft\nutils.data.sft\ndata handling specific to SFT" + }, + { + "objectID": "docs/api/utils.schedulers.html", + "href": "docs/api/utils.schedulers.html", + "title": "utils.schedulers", + "section": "", + "text": "utils.schedulers\nModule for custom LRScheduler class\n\n\n\n\n\nName\nDescription\n\n\n\n\nInterpolatingLogScheduler\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\nRexLR\nReflected Exponential (REX) learning rate scheduler.\n\n\n\n\n\nutils.schedulers.InterpolatingLogScheduler(\n self,\n optimizer,\n num_steps,\n min_lr,\n max_lr,\n last_epoch=-1,\n)\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\n\nutils.schedulers.RexLR(\n self,\n optimizer,\n max_lr,\n min_lr,\n total_steps=0,\n num_warmup_steps=0,\n last_step=0,\n)\nReflected Exponential (REX) learning rate scheduler.\n\nOriginal implementation: https://github.com/IvanVassi/REX_LR\nOriginal license: Apache 2.0\nBased on: 
https://arxiv.org/abs/2107.04197\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\ntorch.optim.Optimizer\nThe optimizer to schedule the learning rate for.\nrequired\n\n\nmax_lr\nfloat\nThe maximum learning rate.\nrequired\n\n\nmin_lr\nfloat\nThe minimum learning rate.\nrequired\n\n\ntotal_steps\nint\nThe total number of training steps.\n0\n\n\nnum_warmup_steps\nint\nThe number of warmup steps.\n0\n\n\nlast_step\nint\nThe index of last step.\n0\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_cosine_schedule_with_min_lr\n\n\n\nget_cosine_schedule_with_quadratic_warmup\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\n\n\nget_cosine_schedule_with_warmup_decay_constant\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_min_lr(\n optimizer,\n num_warmup_steps,\n num_training_steps,\n min_lr_ratio=0.0,\n)\n\n\n\nlinear warmup from 0 -> max_lr over num_warmup_steps\ncosine learning rate annealing from max_lr -> min_lr over num_training_steps\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_quadratic_warmup(\n optimizer,\n num_warmup_steps,\n num_training_steps,\n num_cycles=0.5,\n last_epoch=-1,\n)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the\ninitial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nnum_cycles\nfloat, optional, defaults to 0.5\nThe number of waves 
in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine).\n0.5\n\n\nlast_epoch\nint, optional, defaults to -1\nThe index of the last epoch when resuming training.\n-1\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule.\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_warmup_decay_constant(\n optimizer,\n num_warmup_steps,\n num_training_steps,\n constant_lr_ratio,\n min_lr_ratio,\n num_cycles=0.5,\n last_epoch=-1,\n)\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate\n, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nconstant_lr_ratio\nfloat\n(float): The ratio of num_training_steps to decrease by cosine function.\nrequired\n\n\nmin_lr_ratio\nfloat\n(float): The ratio of maximum learning rate for cosine function to decay to minimum learning rate. | _required_ | | num_cycles |float, *optional*, defaults to 0.5 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine). |0.5| | last_epoch |int, *optional*, defaults to -1 | The index of the last epoch when resuming training. |-1`\n\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule." 
+ }, + { + "objectID": "docs/api/utils.schedulers.html#classes", + "href": "docs/api/utils.schedulers.html#classes", + "title": "utils.schedulers", + "section": "", + "text": "Name\nDescription\n\n\n\n\nInterpolatingLogScheduler\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\nRexLR\nReflected Exponential (REX) learning rate scheduler.\n\n\n\n\n\nutils.schedulers.InterpolatingLogScheduler(\n self,\n optimizer,\n num_steps,\n min_lr,\n max_lr,\n last_epoch=-1,\n)\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\n\nutils.schedulers.RexLR(\n self,\n optimizer,\n max_lr,\n min_lr,\n total_steps=0,\n num_warmup_steps=0,\n last_step=0,\n)\nReflected Exponential (REX) learning rate scheduler.\n\nOriginal implementation: https://github.com/IvanVassi/REX_LR\nOriginal license: Apache 2.0\nBased on: https://arxiv.org/abs/2107.04197\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\ntorch.optim.Optimizer\nThe optimizer to schedule the learning rate for.\nrequired\n\n\nmax_lr\nfloat\nThe maximum learning rate.\nrequired\n\n\nmin_lr\nfloat\nThe minimum learning rate.\nrequired\n\n\ntotal_steps\nint\nThe total number of training steps.\n0\n\n\nnum_warmup_steps\nint\nThe number of warmup steps.\n0\n\n\nlast_step\nint\nThe index of last step.\n0" + }, + { + "objectID": "docs/api/utils.schedulers.html#functions", + "href": "docs/api/utils.schedulers.html#functions", + "title": "utils.schedulers", + "section": "", + "text": "Name\nDescription\n\n\n\n\nget_cosine_schedule_with_min_lr\n\n\n\nget_cosine_schedule_with_quadratic_warmup\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\n\n\nget_cosine_schedule_with_warmup_decay_constant\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? 
(https://arxiv.org/pdf/2308.04014.pdf)\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_min_lr(\n optimizer,\n num_warmup_steps,\n num_training_steps,\n min_lr_ratio=0.0,\n)\n\n\n\nlinear warmup from 0 -> max_lr over num_warmup_steps\ncosine learning rate annealing from max_lr -> min_lr over num_training_steps\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_quadratic_warmup(\n optimizer,\n num_warmup_steps,\n num_training_steps,\n num_cycles=0.5,\n last_epoch=-1,\n)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the\ninitial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nnum_cycles\nfloat, optional, defaults to 0.5\nThe number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine).\n0.5\n\n\nlast_epoch\nint, optional, defaults to -1\nThe index of the last epoch when resuming training.\n-1\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule.\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_warmup_decay_constant(\n optimizer,\n num_warmup_steps,\n num_training_steps,\n constant_lr_ratio,\n min_lr_ratio,\n num_cycles=0.5,\n last_epoch=-1,\n)\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? 
(https://arxiv.org/pdf/2308.04014.pdf)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate\n, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nconstant_lr_ratio\nfloat\n(float): The ratio of num_training_steps to decrease by cosine function.\nrequired\n\n\nmin_lr_ratio\nfloat\n(float): The ratio of maximum learning rate for cosine function to decay to minimum learning rate. | _required_ | | num_cycles |float, *optional*, defaults to 0.5 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine). |0.5| | last_epoch |int, *optional*, defaults to -1 | The index of the last epoch when resuming training. |-1`\n\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule." 
+ }, + { + "objectID": "docs/api/utils.chat_templates.html", + "href": "docs/api/utils.chat_templates.html", + "title": "utils.chat_templates", + "section": "", + "text": "utils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\nThese templates are used for formatting messages in a conversation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_chat_template\nFinds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.\n\n\nregister_chat_template\nRegisters chat templates.\n\n\n\n\n\nutils.chat_templates.get_chat_template(\n user_choice,\n jinja_template=None,\n tokenizer=None,\n)\nFinds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nuser_choice\nstr\nThe user’s choice of template.\nrequired\n\n\njinja_template\nOptional[str]\nThe jinja template string. Defaults to None.\nNone\n\n\ntokenizer\nOptional[PreTrainedTokenizerBase]\nThe tokenizer. 
Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nstr\nstr\nThe chosen template string.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the user_choice is not found in the templates.\n\n\n\n\n\n\n\nutils.chat_templates.register_chat_template(template_name, chat_template)\nRegisters chat templates.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntemplate_name\nstr\nThe name of the template.\nrequired\n\n\nchat_template\nstr\nThe template string.\nrequired" + }, + { + "objectID": "docs/api/utils.chat_templates.html#functions", + "href": "docs/api/utils.chat_templates.html#functions", + "title": "utils.chat_templates", + "section": "", + "text": "Name\nDescription\n\n\n\n\nget_chat_template\nFinds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.\n\n\nregister_chat_template\nRegisters chat templates.\n\n\n\n\n\nutils.chat_templates.get_chat_template(\n user_choice,\n jinja_template=None,\n tokenizer=None,\n)\nFinds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nuser_choice\nstr\nThe user’s choice of template.\nrequired\n\n\njinja_template\nOptional[str]\nThe jinja template string. Defaults to None.\nNone\n\n\ntokenizer\nOptional[PreTrainedTokenizerBase]\nThe tokenizer. 
Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nstr\nstr\nThe chosen template string.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the user_choice is not found in the templates.\n\n\n\n\n\n\n\nutils.chat_templates.register_chat_template(template_name, chat_template)\nRegisters chat templates.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntemplate_name\nstr\nThe name of the template.\nrequired\n\n\nchat_template\nstr\nThe template string.\nrequired" + }, + { + "objectID": "docs/api/utils.models.html", + "href": "docs/api/utils.models.html", + "title": "utils.models", + "section": "", + "text": "utils.models\nModule for models and model loading\n\n\n\n\n\nName\nDescription\n\n\n\n\nModelLoader\nModelLoader: managing all the config and monkey patches while loading model\n\n\n\n\n\nutils.models.ModelLoader(\n self,\n cfg,\n tokenizer,\n *,\n processor=None,\n inference=False,\n reference_model=False,\n **kwargs,\n)\nModelLoader: managing all the config and monkey patches while loading model\n\n\n\n\n\nName\nDescription\n\n\n\n\nhas_flash_attn\nCheck if flash attention is installed\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\npatch_llama_derived_model\nModify all llama derived models in one block\n\n\npatch_loss_llama\nPatch loss functions and other optimizations\n\n\nset_attention_config\nsample packing uses custom FA2 patch\n\n\nset_auto_model_loader\nSet self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM\n\n\n\n\n\nutils.models.ModelLoader.patch_llama_derived_model()\nModify all llama derived models in one block\n\n\n\nutils.models.ModelLoader.patch_loss_llama()\nPatch loss functions and other optimizations\n\n\n\nutils.models.ModelLoader.set_attention_config()\nsample packing uses custom FA2 patch\n\n\n\nutils.models.ModelLoader.set_auto_model_loader()\nSet self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM\n(set at __init__). 
When using a multimodal model, self.auto_model_loader\nshould be set according to the type of the model.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_module_class_from_name\nGets a class from a module by its name.\n\n\nload_model\nLoad a model for a given configuration and tokenizer.\n\n\nload_tokenizer\nLoad and configure the tokenizer based on the provided config.\n\n\nmodify_tokenizer_files\nModify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.\n\n\nsetup_quantized_meta_for_peft\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\nsetup_quantized_peft_meta_for_training\nReplaces dummy quant_state.to method with the original function to allow training to continue\n\n\n\n\n\nutils.models.get_module_class_from_name(module, name)\nGets a class from a module by its name.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodule\ntorch.nn.Module\nThe module to get the class from.\nrequired\n\n\nname\nstr\nThe name of the class.\nrequired\n\n\n\n\n\n\n\nutils.models.load_model(\n cfg,\n tokenizer,\n *,\n processor=None,\n inference=False,\n reference_model=False,\n **kwargs,\n)\nLoad a model for a given configuration and tokenizer.\n\n\n\nutils.models.load_tokenizer(cfg)\nLoad and configure the tokenizer based on the provided config.\n\n\n\nutils.models.modify_tokenizer_files(tokenizer_path, token_mappings, output_dir)\nModify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.\nThis only works with reserved tokens that were added to the tokenizer, not tokens already part of the vocab.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer_path\nstr\nPath or name of the original tokenizer\nrequired\n\n\ntoken_mappings\nDict[int, str]\nDict mapping {token_id (int): new_token_string}\nrequired\n\n\noutput_dir\nstr\nDirectory to save the 
modified tokenizer\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to the modified tokenizer directory\n\n\n\nRef: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941\n\n\n\n\nutils.models.setup_quantized_meta_for_peft(model)\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\n\nutils.models.setup_quantized_peft_meta_for_training(model)\nReplaces dummy quant_state.to method with the original function to allow training to continue" + }, + { + "objectID": "docs/api/utils.models.html#classes", + "href": "docs/api/utils.models.html#classes", + "title": "utils.models", + "section": "", + "text": "Name\nDescription\n\n\n\n\nModelLoader\nModelLoader: managing all the config and monkey patches while loading model\n\n\n\n\n\nutils.models.ModelLoader(\n self,\n cfg,\n tokenizer,\n *,\n processor=None,\n inference=False,\n reference_model=False,\n **kwargs,\n)\nModelLoader: managing all the config and monkey patches while loading model\n\n\n\n\n\nName\nDescription\n\n\n\n\nhas_flash_attn\nCheck if flash attention is installed\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\npatch_llama_derived_model\nModify all llama derived models in one block\n\n\npatch_loss_llama\nPatch loss functions and other optimizations\n\n\nset_attention_config\nsample packing uses custom FA2 patch\n\n\nset_auto_model_loader\nSet self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM\n\n\n\n\n\nutils.models.ModelLoader.patch_llama_derived_model()\nModify all llama derived models in one block\n\n\n\nutils.models.ModelLoader.patch_loss_llama()\nPatch loss functions and other optimizations\n\n\n\nutils.models.ModelLoader.set_attention_config()\nsample packing uses custom FA2 patch\n\n\n\nutils.models.ModelLoader.set_auto_model_loader()\nSet self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM\n(set at __init__). 
When using a multimodal model, self.auto_model_loader\nshould be set according to the type of the model." + }, + { + "objectID": "docs/api/utils.models.html#functions", + "href": "docs/api/utils.models.html#functions", + "title": "utils.models", + "section": "", + "text": "Name\nDescription\n\n\n\n\nget_module_class_from_name\nGets a class from a module by its name.\n\n\nload_model\nLoad a model for a given configuration and tokenizer.\n\n\nload_tokenizer\nLoad and configure the tokenizer based on the provided config.\n\n\nmodify_tokenizer_files\nModify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.\n\n\nsetup_quantized_meta_for_peft\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\nsetup_quantized_peft_meta_for_training\nReplaces dummy quant_state.to method with the original function to allow training to continue\n\n\n\n\n\nutils.models.get_module_class_from_name(module, name)\nGets a class from a module by its name.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodule\ntorch.nn.Module\nThe module to get the class from.\nrequired\n\n\nname\nstr\nThe name of the class.\nrequired\n\n\n\n\n\n\n\nutils.models.load_model(\n cfg,\n tokenizer,\n *,\n processor=None,\n inference=False,\n reference_model=False,\n **kwargs,\n)\nLoad a model for a given configuration and tokenizer.\n\n\n\nutils.models.load_tokenizer(cfg)\nLoad and configure the tokenizer based on the provided config.\n\n\n\nutils.models.modify_tokenizer_files(tokenizer_path, token_mappings, output_dir)\nModify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.\nThis only works with reserved tokens that were added to the tokenizer, not tokens already part of the vocab.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer_path\nstr\nPath or name of the original 
tokenizer\nrequired\n\n\ntoken_mappings\nDict[int, str]\nDict mapping {token_id (int): new_token_string}\nrequired\n\n\noutput_dir\nstr\nDirectory to save the modified tokenizer\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to the modified tokenizer directory\n\n\n\nRef: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941\n\n\n\n\nutils.models.setup_quantized_meta_for_peft(model)\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\n\nutils.models.setup_quantized_peft_meta_for_training(model)\nReplaces dummy quant_state.to method with the original function to allow training to continue" + }, + { + "objectID": "docs/api/prompt_strategies.dpo.chatml.html", + "href": "docs/api/prompt_strategies.dpo.chatml.html", + "title": "prompt_strategies.dpo.chatml", + "section": "", + "text": "prompt_strategies.dpo.chatml\nDPO strategies for chatml\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.chatml.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. 
https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.chatml.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations" + }, + { + "objectID": "docs/api/prompt_strategies.dpo.chatml.html#functions", + "href": "docs/api/prompt_strategies.dpo.chatml.html#functions", + "title": "prompt_strategies.dpo.chatml", + "section": "", + "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.chatml.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.chatml.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations" + }, + { + "objectID": "docs/api/utils.distributed.html", + "href": "docs/api/utils.distributed.html", + "title": "utils.distributed", + "section": "", + "text": "utils.distributed\nutility helpers for distributed checks\n\n\n\n\n\nName\nDescription\n\n\n\n\nbarrier\nActs as a barrier to wait for all processes. 
This ensures that all processes\n\n\ncompute_and_broadcast\nCompute a value using the function ‘fn’ only on the specified rank (default is 0).\n\n\ngather_from_all_ranks\nRun a callable ‘fn’ on all ranks and gather the results on the specified rank.\n\n\ngather_scalar_from_all_ranks\nRun a callable ‘fn’ on all ranks and gather the results on the specified rank.\n\n\nis_distributed\nCheck if distributed training is initialized.\n\n\nis_main_process\nCheck if the current process is the main process.\n\n\nreduce_and_broadcast\nRun a callable ‘fn1’ on all ranks, gather the results, reduce them using ‘fn2’,\n\n\nzero_first\nruns the wrapped context so that rank 0 runs first before other ranks\n\n\nzero_only\nContext manager that only runs the enclosed block on the main rank.\n\n\n\n\n\nutils.distributed.barrier()\nActs as a barrier to wait for all processes. This ensures that all processes\nreach the barrier before proceeding further.\n\n\n\nutils.distributed.compute_and_broadcast(fn)\nCompute a value using the function ‘fn’ only on the specified rank (default is 0).\nThe value is then broadcasted to all other ranks.\nArgs:\n- fn (callable): A function that computes the value. This should not have any side effects.\n- rank (int, optional): The rank that computes the value. Default is 0.\nReturns:\n- The computed value (int or float).\n\n\n\nutils.distributed.gather_from_all_ranks(fn, world_size=1)\nRun a callable ‘fn’ on all ranks and gather the results on the specified rank.\nArgs:\n- fn (callable): A function that computes the value. This should not have any side effects.\n- rank (int, optional): The rank that gathers the values. 
Default is 0.\n- world_size (int, optional): Total number of processes in the current distributed setup.\nReturns:\n- A list of computed values from all ranks if on the gathering rank, otherwise None.\n\n\n\nutils.distributed.gather_scalar_from_all_ranks(fn, world_size=1)\nRun a callable ‘fn’ on all ranks and gather the results on the specified rank.\nArgs:\n- fn (callable): A function that computes the value. This should not have any side effects.\n- rank (int, optional): The rank that gathers the values. Default is 0.\n- world_size (int, optional): Total number of processes in the current distributed setup.\nReturns:\n- A list of computed values from all ranks if on the gathering rank, otherwise None.\n\n\n\nutils.distributed.is_distributed()\nCheck if distributed training is initialized.\n\n\n\nutils.distributed.is_main_process()\nCheck if the current process is the main process.\nIf not in distributed mode, always return True.\n\n\n\nutils.distributed.reduce_and_broadcast(fn1, fn2)\nRun a callable ‘fn1’ on all ranks, gather the results, reduce them using ‘fn2’,\nand then broadcast the reduced result to all ranks.\nArgs:\n- fn1 (callable): A function that computes the value on each rank.\n- fn2 (callable): A reduction function that takes a list of values and returns a single value.\n- world_size (int, optional): Total number of processes in the current distributed setup.\nReturns:\n- The reduced and broadcasted value.\n\n\n\nutils.distributed.zero_first(is_main)\nruns the wrapped context so that rank 0 runs first before other ranks\n\n\n\nutils.distributed.zero_only()\nContext manager that only runs the enclosed block on the main rank." + }, + { + "objectID": "docs/api/utils.distributed.html#functions", + "href": "docs/api/utils.distributed.html#functions", + "title": "utils.distributed", + "section": "", + "text": "Name\nDescription\n\n\n\n\nbarrier\nActs as a barrier to wait for all processes. 
This ensures that all processes\n\n\ncompute_and_broadcast\nCompute a value using the function ‘fn’ only on the specified rank (default is 0).\n\n\ngather_from_all_ranks\nRun a callable ‘fn’ on all ranks and gather the results on the specified rank.\n\n\ngather_scalar_from_all_ranks\nRun a callable ‘fn’ on all ranks and gather the results on the specified rank.\n\n\nis_distributed\nCheck if distributed training is initialized.\n\n\nis_main_process\nCheck if the current process is the main process.\n\n\nreduce_and_broadcast\nRun a callable ‘fn1’ on all ranks, gather the results, reduce them using ‘fn2’,\n\n\nzero_first\nruns the wrapped context so that rank 0 runs first before other ranks\n\n\nzero_only\nContext manager that only runs the enclosed block on the main rank.\n\n\n\n\n\nutils.distributed.barrier()\nActs as a barrier to wait for all processes. This ensures that all processes\nreach the barrier before proceeding further.\n\n\n\nutils.distributed.compute_and_broadcast(fn)\nCompute a value using the function ‘fn’ only on the specified rank (default is 0).\nThe value is then broadcasted to all other ranks.\nArgs:\n- fn (callable): A function that computes the value. This should not have any side effects.\n- rank (int, optional): The rank that computes the value. Default is 0.\nReturns:\n- The computed value (int or float).\n\n\n\nutils.distributed.gather_from_all_ranks(fn, world_size=1)\nRun a callable ‘fn’ on all ranks and gather the results on the specified rank.\nArgs:\n- fn (callable): A function that computes the value. This should not have any side effects.\n- rank (int, optional): The rank that gathers the values. 
Default is 0.\n- world_size (int, optional): Total number of processes in the current distributed setup.\nReturns:\n- A list of computed values from all ranks if on the gathering rank, otherwise None.\n\n\n\nutils.distributed.gather_scalar_from_all_ranks(fn, world_size=1)\nRun a callable ‘fn’ on all ranks and gather the results on the specified rank.\nArgs:\n- fn (callable): A function that computes the value. This should not have any side effects.\n- rank (int, optional): The rank that gathers the values. Default is 0.\n- world_size (int, optional): Total number of processes in the current distributed setup.\nReturns:\n- A list of computed values from all ranks if on the gathering rank, otherwise None.\n\n\n\nutils.distributed.is_distributed()\nCheck if distributed training is initialized.\n\n\n\nutils.distributed.is_main_process()\nCheck if the current process is the main process.\nIf not in distributed mode, always return True.\n\n\n\nutils.distributed.reduce_and_broadcast(fn1, fn2)\nRun a callable ‘fn1’ on all ranks, gather the results, reduce them using ‘fn2’,\nand then broadcast the reduced result to all ranks.\nArgs:\n- fn1 (callable): A function that computes the value on each rank.\n- fn2 (callable): A reduction function that takes a list of values and returns a single value.\n- world_size (int, optional): Total number of processes in the current distributed setup.\nReturns:\n- The reduced and broadcasted value.\n\n\n\nutils.distributed.zero_first(is_main)\nruns the wrapped context so that rank 0 runs first before other ranks\n\n\n\nutils.distributed.zero_only()\nContext manager that only runs the enclosed block on the main rank." 
+ }, + { + "objectID": "docs/api/monkeypatch.utils.html", + "href": "docs/api/monkeypatch.utils.html", + "title": "monkeypatch.utils", + "section": "", + "text": "monkeypatch.utils\nShared utils for the monkeypatches\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_cu_seqlens\ngenerate a cumulative sequence length mask for flash attention using attn mask\n\n\nget_cu_seqlens_from_pos_ids\ngenerate a cumulative sequence length mask for flash attention using pos ids\n\n\nmask_2d_to_4d\nExpands attention_mask from [bsz, seq_len] to [bsz, 1, tgt_seq_len, src_seq_len].\n\n\n\n\n\nmonkeypatch.utils.get_cu_seqlens(attn_mask)\ngenerate a cumulative sequence length mask for flash attention using attn mask\n\n\n\nmonkeypatch.utils.get_cu_seqlens_from_pos_ids(position_ids)\ngenerate a cumulative sequence length mask for flash attention using pos ids\n\n\n\nmonkeypatch.utils.mask_2d_to_4d(mask, dtype, tgt_len=None)\nExpands attention_mask from [bsz, seq_len] to [bsz, 1, tgt_seq_len, src_seq_len].\nThis expansion handles packed sequences so that sequences share the same attention mask integer value\nwhen they attend to each other within that sequence.\nThis expansion transforms the mask to lower triangular form to prevent future peeking." 
+ }, + { + "objectID": "docs/api/monkeypatch.utils.html#functions", + "href": "docs/api/monkeypatch.utils.html#functions", + "title": "monkeypatch.utils", + "section": "", + "text": "Name\nDescription\n\n\n\n\nget_cu_seqlens\ngenerate a cumulative sequence length mask for flash attention using attn mask\n\n\nget_cu_seqlens_from_pos_ids\ngenerate a cumulative sequence length mask for flash attention using pos ids\n\n\nmask_2d_to_4d\nExpands attention_mask from [bsz, seq_len] to [bsz, 1, tgt_seq_len, src_seq_len].\n\n\n\n\n\nmonkeypatch.utils.get_cu_seqlens(attn_mask)\ngenerate a cumulative sequence length mask for flash attention using attn mask\n\n\n\nmonkeypatch.utils.get_cu_seqlens_from_pos_ids(position_ids)\ngenerate a cumulative sequence length mask for flash attention using pos ids\n\n\n\nmonkeypatch.utils.mask_2d_to_4d(mask, dtype, tgt_len=None)\nExpands attention_mask from [bsz, seq_len] to [bsz, 1, tgt_seq_len, src_seq_len].\nThis expansion handles packed sequences so that sequences share the same attention mask integer value\nwhen they attend to each other within that sequence.\nThis expansion transforms the mask to lower triangular form to prevent future peeking." 
+ }, + { + "objectID": "docs/api/utils.schemas.utils.html", + "href": "docs/api/utils.schemas.utils.html", + "title": "utils.schemas.utils", + "section": "", + "text": "utils.schemas.utils\nUtilities for Axolotl Pydantic models\n\n\n\n\n\nName\nDescription\n\n\n\n\nhandle_legacy_message_fields_logic\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\n\n\n\n\n\nutils.schemas.utils.handle_legacy_message_fields_logic(data)\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\nPreviously, the config only supported mapping ‘role’ and ‘content’ fields via dedicated config options:\n- message_field_role: Mapped to the role field\n- message_field_content: Mapped to the content field\nThe new system uses message_property_mappings to support arbitrary field mappings:\nmessage_property_mappings:\nrole: source_role_field\ncontent: source_content_field\nadditional_field: source_field\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\ndict\nDictionary containing configuration data\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ndict\nUpdated dictionary with message field mappings consolidated\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf there are conflicts between legacy and new mappings" + }, + { + "objectID": "docs/api/utils.schemas.utils.html#functions", + "href": "docs/api/utils.schemas.utils.html#functions", + "title": "utils.schemas.utils", + "section": "", + "text": "Name\nDescription\n\n\n\n\nhandle_legacy_message_fields_logic\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\n\n\n\n\n\nutils.schemas.utils.handle_legacy_message_fields_logic(data)\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\nPreviously, the config only supported mapping ‘role’ and ‘content’ fields via dedicated config 
options:\n- message_field_role: Mapped to the role field\n- message_field_content: Mapped to the content field\nThe new system uses message_property_mappings to support arbitrary field mappings:\nmessage_property_mappings:\nrole: source_role_field\ncontent: source_content_field\nadditional_field: source_field\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\ndict\nDictionary containing configuration data\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ndict\nUpdated dictionary with message field mappings consolidated\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf there are conflicts between legacy and new mappings" + }, + { + "objectID": "docs/api/monkeypatch.llama_expand_mask.html", + "href": "docs/api/monkeypatch.llama_expand_mask.html", + "title": "monkeypatch.llama_expand_mask", + "section": "", + "text": "monkeypatch.llama_expand_mask\nmonkeypatch.llama_expand_mask\nexpands the binary attention mask per 3.2.2 of https://arxiv.org/pdf/2107.02027.pdf" + }, + { + "objectID": "docs/api/common.datasets.html", + "href": "docs/api/common.datasets.html", + "title": "common.datasets", + "section": "", + "text": "common.datasets\nDataset loading utilities.\n\n\n\n\n\nName\nDescription\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and validation datasets and metadata.\n\n\n\n\n\ncommon.datasets.TrainDatasetMeta(\n self,\n train_dataset,\n eval_dataset=None,\n total_num_steps=None,\n)\nDataclass with fields for training and validation datasets and metadata.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_datasets\nLoads one or more training or evaluation datasets, calling\n\n\nload_preference_datasets\nLoads one or more training or evaluation datasets for RL training using paired\n\n\nsample_dataset\nRandomly sample num_samples samples from dataset.\n\n\n\n\n\ncommon.datasets.load_datasets(cfg, cli_args)\nLoads one or more training or evaluation datasets, 
calling\naxolotl.utils.data.prepare_dataset. Optionally, logs out debug information.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nUnion[PreprocessCliArgs, TrainerCliArgs]\nCommand-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and evaluation datasets and the computed\n\n\n\nTrainDatasetMeta\ntotal_num_steps.\n\n\n\n\n\n\n\ncommon.datasets.load_preference_datasets(cfg, cli_args)\nLoads one or more training or evaluation datasets for RL training using paired\npreference data, calling axolotl.utils.data.rl.load_prepare_preference_datasets.\nOptionally, logs out debug information.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nUnion[PreprocessCliArgs, TrainerCliArgs]\nCommand-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and evaluation datasets and the computed\n\n\n\nTrainDatasetMeta\ntotal_num_steps.\n\n\n\n\n\n\n\ncommon.datasets.sample_dataset(dataset, num_samples)\nRandomly sample num_samples samples from dataset.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndataset\nDataset\nDataset.\nrequired\n\n\nnum_samples\nint\nNumber of samples to return.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDataset\nRandom sample (with replacement) of examples in dataset." 
+ }, + { + "objectID": "docs/api/common.datasets.html#classes", + "href": "docs/api/common.datasets.html#classes", + "title": "common.datasets", + "section": "", + "text": "Name\nDescription\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and validation datasets and metadata.\n\n\n\n\n\ncommon.datasets.TrainDatasetMeta(\n self,\n train_dataset,\n eval_dataset=None,\n total_num_steps=None,\n)\nDataclass with fields for training and validation datasets and metadata." + }, + { + "objectID": "docs/api/common.datasets.html#functions", + "href": "docs/api/common.datasets.html#functions", + "title": "common.datasets", + "section": "", + "text": "Name\nDescription\n\n\n\n\nload_datasets\nLoads one or more training or evaluation datasets, calling\n\n\nload_preference_datasets\nLoads one or more training or evaluation datasets for RL training using paired\n\n\nsample_dataset\nRandomly sample num_samples samples from dataset.\n\n\n\n\n\ncommon.datasets.load_datasets(cfg, cli_args)\nLoads one or more training or evaluation datasets, calling\naxolotl.utils.data.prepare_dataset. 
Optionally, logs out debug information.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nUnion[PreprocessCliArgs, TrainerCliArgs]\nCommand-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and evaluation datasets and the computed\n\n\n\nTrainDatasetMeta\ntotal_num_steps.\n\n\n\n\n\n\n\ncommon.datasets.load_preference_datasets(cfg, cli_args)\nLoads one or more training or evaluation datasets for RL training using paired\npreference data, calling axolotl.utils.data.rl.load_prepare_preference_datasets.\nOptionally, logs out debug information.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nUnion[PreprocessCliArgs, TrainerCliArgs]\nCommand-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and evaluation datasets and the computed\n\n\n\nTrainDatasetMeta\ntotal_num_steps.\n\n\n\n\n\n\n\ncommon.datasets.sample_dataset(dataset, num_samples)\nRandomly sample num_samples samples from dataset.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndataset\nDataset\nDataset.\nrequired\n\n\nnum_samples\nint\nNumber of samples to return.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDataset\nRandom sample (with replacement) of examples in dataset." 
+ }, + { + "objectID": "docs/api/logging_config.html", + "href": "docs/api/logging_config.html", + "title": "logging_config", + "section": "", + "text": "logging_config\nCommon logging module for axolotl\n\n\n\n\n\nName\nDescription\n\n\n\n\nColorfulFormatter\nFormatter to add coloring to log messages by log type\n\n\n\n\n\nlogging_config.ColorfulFormatter()\nFormatter to add coloring to log messages by log type\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nconfigure_logging\nConfigure with default logging\n\n\n\n\n\nlogging_config.configure_logging()\nConfigure with default logging" + }, + { + "objectID": "docs/api/logging_config.html#classes", + "href": "docs/api/logging_config.html#classes", + "title": "logging_config", + "section": "", + "text": "Name\nDescription\n\n\n\n\nColorfulFormatter\nFormatter to add coloring to log messages by log type\n\n\n\n\n\nlogging_config.ColorfulFormatter()\nFormatter to add coloring to log messages by log type" + }, + { + "objectID": "docs/api/logging_config.html#functions", + "href": "docs/api/logging_config.html#functions", + "title": "logging_config", + "section": "", + "text": "Name\nDescription\n\n\n\n\nconfigure_logging\nConfigure with default logging\n\n\n\n\n\nlogging_config.configure_logging()\nConfigure with default logging" + }, + { + "objectID": "docs/api/kernels.quantize.html", + "href": "docs/api/kernels.quantize.html", + "title": "kernels.quantize", + "section": "", + "text": "kernels.quantize\nDequantization utilities for bitsandbytes integration.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndequantize\nFast NF4 dequantization using bitsandbytes CUDA kernels.\n\n\n\n\n\nkernels.quantize.dequantize(W, quant_state=None, out=None)\nFast NF4 dequantization using bitsandbytes CUDA kernels.\nPerforms efficient dequantization of weights from NF4 format using bitsandbytes’\noptimized CUDA implementations. 
Supports both legacy list and new QuantState\nformats.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nW\ntorch.Tensor\nQuantized weight tensor to dequantize\nrequired\n\n\nquant_state\nQuantState | list | None\nQuantization state containing metadata needed for dequantization. Can be either a QuantState object or legacy list format. If None, returns W unchanged.\nNone\n\n\nout\ntorch.Tensor | None\nOptional output tensor for storing dequantized results. Must match expected shape and dtype if provided.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nDequantized tensor in the specified dtype (fp16 or bf16). Will be transposed if\n\n\n\ntorch.Tensor\ninput W was transposed.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAssertionError\nIf provided output tensor doesn’t match expected shape / dtype.\n\n\n\n\n\n\nUses CUDA streams for better performance when available in newer bitsandbytes\nversions (>0.43.3)." + }, + { + "objectID": "docs/api/kernels.quantize.html#functions", + "href": "docs/api/kernels.quantize.html#functions", + "title": "kernels.quantize", + "section": "", + "text": "Name\nDescription\n\n\n\n\ndequantize\nFast NF4 dequantization using bitsandbytes CUDA kernels.\n\n\n\n\n\nkernels.quantize.dequantize(W, quant_state=None, out=None)\nFast NF4 dequantization using bitsandbytes CUDA kernels.\nPerforms efficient dequantization of weights from NF4 format using bitsandbytes’\noptimized CUDA implementations. Supports both legacy list and new QuantState\nformats.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nW\ntorch.Tensor\nQuantized weight tensor to dequantize\nrequired\n\n\nquant_state\nQuantState | list | None\nQuantization state containing metadata needed for dequantization. Can be either a QuantState object or legacy list format. If None, returns W unchanged.\nNone\n\n\nout\ntorch.Tensor | None\nOptional output tensor for storing dequantized results. 
Must match expected shape and dtype if provided.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nDequantized tensor in the specified dtype (fp16 or bf16). Will be transposed if\n\n\n\ntorch.Tensor\ninput W was transposed.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAssertionError\nIf provided output tensor doesn’t match expected shape / dtype.\n\n\n\n\n\n\nUses CUDA streams for better performance when available in newer bitsandbytes\nversions (>0.43.3)." + }, + { + "objectID": "docs/api/utils.schemas.model.html", + "href": "docs/api/utils.schemas.model.html", + "title": "utils.schemas.model", + "section": "", + "text": "utils.schemas.model\nPydantic models for model input / output, etc. configuration\n\n\n\n\n\nName\nDescription\n\n\n\n\nModelInputConfig\nModel configuration subset\n\n\nModelOutputConfig\nmodel save configuration subset\n\n\nSpecialTokensConfig\nSpecial tokens configuration subset\n\n\n\n\n\nutils.schemas.model.ModelInputConfig()\nModel configuration subset\n\n\n\nutils.schemas.model.ModelOutputConfig()\nmodel save configuration subset\n\n\n\nutils.schemas.model.SpecialTokensConfig()\nSpecial tokens configuration subset" + }, + { + "objectID": "docs/api/utils.schemas.model.html#classes", + "href": "docs/api/utils.schemas.model.html#classes", + "title": "utils.schemas.model", + "section": "", + "text": "Name\nDescription\n\n\n\n\nModelInputConfig\nModel configuration subset\n\n\nModelOutputConfig\nmodel save configuration subset\n\n\nSpecialTokensConfig\nSpecial tokens configuration subset\n\n\n\n\n\nutils.schemas.model.ModelInputConfig()\nModel configuration subset\n\n\n\nutils.schemas.model.ModelOutputConfig()\nmodel save configuration subset\n\n\n\nutils.schemas.model.SpecialTokensConfig()\nSpecial tokens configuration subset" + }, + { + "objectID": "docs/api/monkeypatch.stablelm_attn_hijack_flash.html", + "href": "docs/api/monkeypatch.stablelm_attn_hijack_flash.html", + "title": 
"monkeypatch.stablelm_attn_hijack_flash", + "section": "", + "text": "monkeypatch.stablelm_attn_hijack_flash\nPyTorch StableLM Epoch model.\n\n\n\n\n\nName\nDescription\n\n\n\n\nrepeat_kv\nThis is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,\n\n\nrotate_half\nRotates half the hidden dims of the input.\n\n\n\n\n\nmonkeypatch.stablelm_attn_hijack_flash.repeat_kv(hidden_states, n_rep)\nThis is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,\nnum_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)\n\n\n\nmonkeypatch.stablelm_attn_hijack_flash.rotate_half(x)\nRotates half the hidden dims of the input." + }, + { + "objectID": "docs/api/monkeypatch.stablelm_attn_hijack_flash.html#functions", + "href": "docs/api/monkeypatch.stablelm_attn_hijack_flash.html#functions", + "title": "monkeypatch.stablelm_attn_hijack_flash", + "section": "", + "text": "Name\nDescription\n\n\n\n\nrepeat_kv\nThis is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,\n\n\nrotate_half\nRotates half the hidden dims of the input.\n\n\n\n\n\nmonkeypatch.stablelm_attn_hijack_flash.repeat_kv(hidden_states, n_rep)\nThis is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,\nnum_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)\n\n\n\nmonkeypatch.stablelm_attn_hijack_flash.rotate_half(x)\nRotates half the hidden dims of the input." 
+ }, + { + "objectID": "docs/api/monkeypatch.mixtral.html", + "href": "docs/api/monkeypatch.mixtral.html", + "title": "monkeypatch.mixtral", + "section": "", + "text": "monkeypatch.mixtral\nmonkeypatch.mixtral\nPatches to support multipack for mixtral" + }, + { + "objectID": "docs/api/utils.tokenization.html", + "href": "docs/api/utils.tokenization.html", + "title": "utils.tokenization", + "section": "", + "text": "utils.tokenization\nModule for tokenization utilities\n\n\n\n\n\nName\nDescription\n\n\n\n\ncolor_token_for_rl_debug\nHelper function to color tokens based on their type.\n\n\nprocess_tokens_for_rl_debug\nHelper function to process and color tokens.\n\n\n\n\n\nutils.tokenization.color_token_for_rl_debug(\n decoded_token,\n encoded_token,\n color,\n text_only,\n)\nHelper function to color tokens based on their type.\n\n\n\nutils.tokenization.process_tokens_for_rl_debug(\n tokens,\n color,\n tokenizer,\n text_only,\n)\nHelper function to process and color tokens." + }, + { + "objectID": "docs/api/utils.tokenization.html#functions", + "href": "docs/api/utils.tokenization.html#functions", + "title": "utils.tokenization", + "section": "", + "text": "Name\nDescription\n\n\n\n\ncolor_token_for_rl_debug\nHelper function to color tokens based on their type.\n\n\nprocess_tokens_for_rl_debug\nHelper function to process and color tokens.\n\n\n\n\n\nutils.tokenization.color_token_for_rl_debug(\n decoded_token,\n encoded_token,\n color,\n text_only,\n)\nHelper function to color tokens based on their type.\n\n\n\nutils.tokenization.process_tokens_for_rl_debug(\n tokens,\n color,\n tokenizer,\n text_only,\n)\nHelper function to process and color tokens." 
+ }, + { + "objectID": "docs/api/integrations.kd.trainer.html", + "href": "docs/api/integrations.kd.trainer.html", + "title": "integrations.kd.trainer", + "section": "", + "text": "integrations.kd.trainer\nKD trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlKDTrainer\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer(\n self,\n *_args,\n bench_data_collator=None,\n eval_data_collator=None,\n dataset_tags=None,\n **kwargs,\n)\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute_loss\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer.compute_loss(\n model,\n inputs,\n return_outputs=False,\n num_items_in_batch=None,\n)\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\nSubclass and override for custom behavior." + }, + { + "objectID": "docs/api/integrations.kd.trainer.html#classes", + "href": "docs/api/integrations.kd.trainer.html#classes", + "title": "integrations.kd.trainer", + "section": "", + "text": "Name\nDescription\n\n\n\n\nAxolotlKDTrainer\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer(\n self,\n *_args,\n bench_data_collator=None,\n eval_data_collator=None,\n dataset_tags=None,\n **kwargs,\n)\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute_loss\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer.compute_loss(\n model,\n inputs,\n return_outputs=False,\n num_items_in_batch=None,\n)\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\nSubclass and override for custom behavior." 
+ }, + { + "objectID": "docs/api/utils.schemas.datasets.html", + "href": "docs/api/utils.schemas.datasets.html", + "title": "utils.schemas.datasets", + "section": "", + "text": "utils.schemas.datasets\nPydantic models for datasets-related configuration\n\n\n\n\n\nName\nDescription\n\n\n\n\nDPODataset\nDPO configuration subset\n\n\nKTODataset\nKTO configuration subset\n\n\nPretrainingDataset\nPretraining dataset configuration subset\n\n\nSFTDataset\nSFT configuration subset\n\n\nStepwiseSupervisedDataset\nStepwise supervised dataset configuration subset\n\n\nUserDefinedDPOType\nUser defined typing for DPO\n\n\nUserDefinedKTOType\nUser defined typing for KTO\n\n\nUserDefinedPrompterType\nStructure for user defined prompt types\n\n\n\n\n\nutils.schemas.datasets.DPODataset()\nDPO configuration subset\n\n\n\nutils.schemas.datasets.KTODataset()\nKTO configuration subset\n\n\n\nutils.schemas.datasets.PretrainingDataset()\nPretraining dataset configuration subset\n\n\n\nutils.schemas.datasets.SFTDataset()\nSFT configuration subset\n\n\n\n\n\nName\nDescription\n\n\n\n\nhandle_legacy_message_fields\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\n\n\n\n\n\nutils.schemas.datasets.SFTDataset.handle_legacy_message_fields(data)\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\n\n\n\n\n\nutils.schemas.datasets.StepwiseSupervisedDataset()\nStepwise supervised dataset configuration subset\n\n\n\nutils.schemas.datasets.UserDefinedDPOType()\nUser defined typing for DPO\n\n\n\nutils.schemas.datasets.UserDefinedKTOType()\nUser defined typing for KTO\n\n\n\nutils.schemas.datasets.UserDefinedPrompterType()\nStructure for user defined prompt types" + }, + { + "objectID": "docs/api/utils.schemas.datasets.html#classes", + "href": "docs/api/utils.schemas.datasets.html#classes", + "title": "utils.schemas.datasets", + "section": "", + "text": "Name\nDescription\n\n\n\n\nDPODataset\nDPO 
configuration subset\n\n\nKTODataset\nKTO configuration subset\n\n\nPretrainingDataset\nPretraining dataset configuration subset\n\n\nSFTDataset\nSFT configuration subset\n\n\nStepwiseSupervisedDataset\nStepwise supervised dataset configuration subset\n\n\nUserDefinedDPOType\nUser defined typing for DPO\n\n\nUserDefinedKTOType\nUser defined typing for KTO\n\n\nUserDefinedPrompterType\nStructure for user defined prompt types\n\n\n\n\n\nutils.schemas.datasets.DPODataset()\nDPO configuration subset\n\n\n\nutils.schemas.datasets.KTODataset()\nKTO configuration subset\n\n\n\nutils.schemas.datasets.PretrainingDataset()\nPretraining dataset configuration subset\n\n\n\nutils.schemas.datasets.SFTDataset()\nSFT configuration subset\n\n\n\n\n\nName\nDescription\n\n\n\n\nhandle_legacy_message_fields\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\n\n\n\n\n\nutils.schemas.datasets.SFTDataset.handle_legacy_message_fields(data)\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\n\n\n\n\n\nutils.schemas.datasets.StepwiseSupervisedDataset()\nStepwise supervised dataset configuration subset\n\n\n\nutils.schemas.datasets.UserDefinedDPOType()\nUser defined typing for DPO\n\n\n\nutils.schemas.datasets.UserDefinedKTOType()\nUser defined typing for KTO\n\n\n\nutils.schemas.datasets.UserDefinedPrompterType()\nStructure for user defined prompt types" + }, + { + "objectID": "docs/api/utils.collators.core.html", + "href": "docs/api/utils.collators.core.html", + "title": "utils.collators.core", + "section": "", + "text": "utils.collators.core\nutils.collators.core\nbasic shared collator constants" + }, + { + "objectID": "docs/api/monkeypatch.btlm_attn_hijack_flash.html", + "href": "docs/api/monkeypatch.btlm_attn_hijack_flash.html", + "title": "monkeypatch.btlm_attn_hijack_flash", + "section": "", + "text": "monkeypatch.btlm_attn_hijack_flash\nmonkeypatch.btlm_attn_hijack_flash\nFlash 
attention monkey patch for cerebras btlm model" + }, + { + "objectID": "docs/api/utils.optimizers.adopt.html", + "href": "docs/api/utils.optimizers.adopt.html", + "title": "utils.optimizers.adopt", + "section": "", + "text": "utils.optimizers.adopt\nCopied from https://github.com/iShohei220/adopt\nADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate (2024)\nTaniguchi, Shohei and Harada, Keno and Minegishi, Gouki and Oshima, Yuta and Jeong, Seong Cheol and Nagahara, Go and Iiyama, Tomoshi and Suzuki, Masahiro and Iwasawa, Yusuke and Matsuo, Yutaka\n\n\n\n\n\nName\nDescription\n\n\n\n\nadopt\nFunctional API that performs ADOPT algorithm computation.\n\n\n\n\n\nutils.optimizers.adopt.adopt(\n params,\n grads,\n exp_avgs,\n exp_avg_sqs,\n state_steps,\n foreach=None,\n capturable=False,\n differentiable=False,\n fused=None,\n grad_scale=None,\n found_inf=None,\n has_complex=False,\n *,\n beta1,\n beta2,\n lr,\n clip_lambda,\n weight_decay,\n decouple,\n eps,\n maximize,\n)\nFunctional API that performs ADOPT algorithm computation." + }, + { + "objectID": "docs/api/utils.optimizers.adopt.html#functions", + "href": "docs/api/utils.optimizers.adopt.html#functions", + "title": "utils.optimizers.adopt", + "section": "", + "text": "Name\nDescription\n\n\n\n\nadopt\nFunctional API that performs ADOPT algorithm computation.\n\n\n\n\n\nutils.optimizers.adopt.adopt(\n params,\n grads,\n exp_avgs,\n exp_avg_sqs,\n state_steps,\n foreach=None,\n capturable=False,\n differentiable=False,\n fused=None,\n grad_scale=None,\n found_inf=None,\n has_complex=False,\n *,\n beta1,\n beta2,\n lr,\n clip_lambda,\n weight_decay,\n decouple,\n eps,\n maximize,\n)\nFunctional API that performs ADOPT algorithm computation." 
+ }, + { + "objectID": "docs/api/prompt_strategies.input_output.html", + "href": "docs/api/prompt_strategies.input_output.html", + "title": "prompt_strategies.input_output", + "section": "", + "text": "prompt_strategies.input_output\nModule for plain input/output prompt pairs\n\n\n\n\n\nName\nDescription\n\n\n\n\nRawInputOutputPrompter\nprompter for raw i/o data\n\n\nRawInputOutputStrategy\nPrompt Strategy class for input/output pairs\n\n\n\n\n\nprompt_strategies.input_output.RawInputOutputPrompter()\nprompter for raw i/o data\n\n\n\nprompt_strategies.input_output.RawInputOutputStrategy(\n self,\n *args,\n eos_token=None,\n **kwargs,\n)\nPrompt Strategy class for input/output pairs" + }, + { + "objectID": "docs/api/prompt_strategies.input_output.html#classes", + "href": "docs/api/prompt_strategies.input_output.html#classes", + "title": "prompt_strategies.input_output", + "section": "", + "text": "Name\nDescription\n\n\n\n\nRawInputOutputPrompter\nprompter for raw i/o data\n\n\nRawInputOutputStrategy\nPrompt Strategy class for input/output pairs\n\n\n\n\n\nprompt_strategies.input_output.RawInputOutputPrompter()\nprompter for raw i/o data\n\n\n\nprompt_strategies.input_output.RawInputOutputStrategy(\n self,\n *args,\n eos_token=None,\n **kwargs,\n)\nPrompt Strategy class for input/output pairs" + }, + { + "objectID": "docs/api/index.html", + "href": "docs/api/index.html", + "title": "API Reference", + "section": "", + "text": "Core functionality for training\n\n\n\ntrain\nPrepare and train a model on a dataset. 
Can also infer from a model or merge lora\n\n\nevaluate\nModule for evaluating models.\n\n\ndatasets\nModule containing Dataset functionality\n\n\nconvert\nModule containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes\n\n\nprompt_tokenizers\nModule containing PromptTokenizingStrategy and Prompter classes\n\n\nlogging_config\nCommon logging module for axolotl\n\n\ncore.trainer_builder\nBuilder for the training args and trainer\n\n\ncore.training_args\nextra axolotl specific training args\n\n\ncore.chat.messages\ninternal message representations of chat messages\n\n\ncore.chat.format.chatml\nChatML transformation functions for MessageContents\n\n\ncore.chat.format.llama3x\nLlama 3.x chat formatting functions for MessageContents\n\n\ncore.chat.format.shared\nshared functions for format transforms\n\n\ncore.datasets.chat\nchat dataset module\n\n\ncore.datasets.transforms.chat_builder\nThis module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat.\n\n\n\n\n\n\nCommand-line interface\n\n\n\ncli.main\nClick CLI definitions for various axolotl commands.\n\n\ncli.train\nCLI to run training on a model.\n\n\ncli.evaluate\nCLI to run evaluation on a model.\n\n\ncli.args\nModule for axolotl CLI command arguments.\n\n\ncli.checks\nVarious checks for Axolotl CLI.\n\n\ncli.config\nConfiguration loading and processing.\n\n\ncli.inference\nCLI to run inference on a trained model.\n\n\ncli.merge_lora\nCLI to merge a trained LoRA into a base model.\n\n\ncli.merge_sharded_fsdp_weights\nCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.\n\n\ncli.preprocess\nCLI to run preprocessing of a dataset.\n\n\ncli.sweeps\nUtilities for handling sweeps over configs for axolotl train CLI command\n\n\ncli.utils\nUtility methods for axolotl CLI.\n\n\ncli.cloud.base\nbase class for cloud platforms from cli\n\n\ncli.cloud.modal_\nModal Cloud support from CLI\n\n\n\n\n\n\nTraining 
implementations\n\n\n\ncore.trainers.base\nModule for customized trainers\n\n\ncore.trainers.trl\nModule for TRL PPO trainer\n\n\ncore.trainers.dpo.trainer\nDPO trainer for axolotl\n\n\ncore.trainers.grpo.trainer\nAxolotl GRPO trainer\n\n\n\n\n\n\nPrompt formatting strategies\n\n\n\nprompt_strategies.base\nmodule for base dataset transform strategies\n\n\nprompt_strategies.chat_template\nHF Chat Templates prompt strategy\n\n\nprompt_strategies.alpaca_chat\nModule for Alpaca prompt strategy classes\n\n\nprompt_strategies.alpaca_instruct\nModule loading the AlpacaInstructPromptTokenizingStrategy class\n\n\nprompt_strategies.alpaca_w_system\nPrompt strategies loader for alpaca instruction datasets with system prompts\n\n\nprompt_strategies.user_defined\nUser Defined prompts with configuration from the YML config\n\n\nprompt_strategies.llama2_chat\nPrompt Strategy for finetuning Llama2 chat models\n\n\nprompt_strategies.completion\nBasic completion text\n\n\nprompt_strategies.input_output\nModule for plain input/output prompt pairs\n\n\nprompt_strategies.stepwise_supervised\nModule for stepwise datasets, typically including a prompt and reasoning traces,\n\n\nprompt_strategies.metharme\nModule containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class\n\n\nprompt_strategies.orcamini\nPrompt Strategy for finetuning Orca Mini (v2) models\n\n\nprompt_strategies.pygmalion\nModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class\n\n\nprompt_strategies.messages.chat\nChat dataset wrapping strategy for new internal messages representations\n\n\nprompt_strategies.dpo.chat_template\nDPO prompt strategies for using tokenizer chat templates.\n\n\nprompt_strategies.dpo.llama3\nDPO strategies for llama-3 chat template\n\n\nprompt_strategies.dpo.chatml\nDPO strategies for chatml\n\n\nprompt_strategies.dpo.zephyr\nDPO strategies for zephyr\n\n\nprompt_strategies.dpo.user_defined\nUser-defined DPO 
strategies\n\n\nprompt_strategies.dpo.passthrough\nDPO prompt strategies passthrough/zero-processing strategy\n\n\nprompt_strategies.kto.llama3\nKTO strategies for llama-3 chat template\n\n\nprompt_strategies.kto.chatml\nKTO strategies for chatml\n\n\nprompt_strategies.kto.user_defined\nUser-defined KTO strategies\n\n\nprompt_strategies.orpo.chat_template\nchatml prompt tokenization strategy for ORPO\n\n\nprompt_strategies.bradley_terry.llama3\nchatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template\n\n\n\n\n\n\nLow-level performance optimizations\n\n\n\nkernels.lora\nModule for definition of Low-Rank Adaptation (LoRA) Triton kernels.\n\n\nkernels.geglu\nModule for definition of GEGLU Triton kernels.\n\n\nkernels.swiglu\nModule for definition of SwiGLU Triton kernels.\n\n\nkernels.quantize\nDequantization utilities for bitsandbytes integration.\n\n\nkernels.utils\nUtilities for axolotl.kernels submodules.\n\n\n\n\n\n\nRuntime patches for model optimizations\n\n\n\nmonkeypatch.llama_attn_hijack_flash\nFlash attention monkey patch for llama model\n\n\nmonkeypatch.llama_attn_hijack_xformers\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments\n\n\nmonkeypatch.mistral_attn_hijack_flash\nFlash attention monkey patch for mistral model\n\n\nmonkeypatch.multipack\nmultipack patching for v2 of sample packing\n\n\nmonkeypatch.relora\nImplements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune.\n\n\nmonkeypatch.llama_expand_mask\nexpands the binary attention mask per 3.2.2 of https://arxiv.org/pdf/2107.02027.pdf\n\n\nmonkeypatch.lora_kernels\nModule for patching custom LoRA Triton kernels and torch.autograd functions.\n\n\nmonkeypatch.utils\nShared utils for the monkeypatches\n\n\nmonkeypatch.btlm_attn_hijack_flash\nFlash attention monkey patch for cerebras btlm 
model\n\n\nmonkeypatch.llama_patch_multipack\nPatched LlamaAttention to use torch.nn.functional.scaled_dot_product_attention\n\n\nmonkeypatch.stablelm_attn_hijack_flash\nPyTorch StableLM Epoch model.\n\n\nmonkeypatch.trainer_fsdp_optim\nfix for FSDP optimizer save in trainer w 4.47.0\n\n\nmonkeypatch.transformers_fa_utils\nsee https://github.com/huggingface/transformers/pull/35834\n\n\nmonkeypatch.unsloth_\nmodule for patching with unsloth optimizations\n\n\nmonkeypatch.attention.mllama\nMonkeypatch for Vision Llama for FA2 support\n\n\nmonkeypatch.data.batch_dataset_fetcher\nmonkey patches for the dataset fetcher to handle batches of packed indexes\n\n\nmonkeypatch.mixtral\nPatches to support multipack for mixtral\n\n\n\n\n\n\nUtility functions\n\n\n\nutils.models\nModule for models and model loading\n\n\nutils.tokenization\nModule for tokenization utilities\n\n\nutils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\n\n\nutils.lora\nmodule to get the state dict of a merged lora model\n\n\nutils.lora_embeddings\nhelpers for lora embeddings\n\n\nutils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\nutils.bench\nBenchmarking and measurement utilities\n\n\nutils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\nutils.trainer\nModule containing the Trainer class and related functions\n\n\nutils.schedulers\nModule for custom LRScheduler class\n\n\nutils.distributed\nutility helpers for distributed checks\n\n\nutils.dict\nModule containing the DictDefault class\n\n\nutils.optimizers.adopt\nCopied from https://github.com/iShohei220/adopt\n\n\nutils.data.pretraining\ndata handling specific to pretraining\n\n\nutils.data.sft\ndata handling specific to SFT\n\n\nutils.gradient_checkpointing.unsloth\nUnsloth checkpointing\n\n\n\n\n\n\nPydantic data models for Axolotl config\n\n\n\nutils.schemas.config\nModule with Pydantic models for 
configuration.\n\n\nutils.schemas.model\nPydantic models for model input / output, etc. configuration\n\n\nutils.schemas.training\nPydantic models for training hyperparameters\n\n\nutils.schemas.datasets\nPydantic models for datasets-related configuration\n\n\nutils.schemas.peft\nPydantic models for PEFT-related configuration\n\n\nutils.schemas.trl\nPydantic models for TRL trainer configuration\n\n\nutils.schemas.integrations\nPydantic models for Axolotl integrations\n\n\nutils.schemas.enums\nEnums for Axolotl input config\n\n\nutils.schemas.utils\nUtilities for Axolotl Pydantic models\n\n\n\n\n\n\nThird-party integrations and extensions\n\n\n\nintegrations.base\nBase class for all plugins.\n\n\nintegrations.cut_cross_entropy.args\nModule for handling Cut Cross Entropy input arguments.\n\n\nintegrations.grokfast.optimizer\n\n\n\nintegrations.kd.trainer\nKD trainer\n\n\nintegrations.liger.args\nModule for handling LIGER input arguments.\n\n\nintegrations.lm_eval.args\nModule for handling lm eval harness input arguments.\n\n\nintegrations.spectrum.args\nModule for handling Spectrum input arguments.\n\n\n\n\n\n\nCommon utilities and shared functionality\n\n\n\ncommon.architectures\nCommon architecture specific constants\n\n\ncommon.const\nVarious shared constants\n\n\ncommon.datasets\nDataset loading utilities.\n\n\n\n\n\n\nCustom model implementations\n\n\n\nmodels.mamba.modeling_mamba\n\n\n\n\n\n\n\nData processing utilities\n\n\n\nutils.collators.core\nbasic shared collator constants\n\n\nutils.collators.batching\nData collators for axolotl to pad labels and position_ids for packed sequences. 
Also\n\n\nutils.collators.mamba\ncollators for Mamba\n\n\nutils.collators.mm_chat\nCollators for multi-modal chat messages and packing\n\n\nutils.samplers.multipack\nMultipack Batch Sampler\n\n\n\n\n\n\nTraining callbacks\n\n\n\nutils.callbacks.perplexity\ncallback to calculate perplexity as an evaluation metric.\n\n\nutils.callbacks.profiler\nHF Trainer callback for creating pytorch profiling snapshots\n\n\nutils.callbacks.lisa\nmodule for LISA\n\n\nutils.callbacks.mlflow_\nMLFlow module for trainer callbacks\n\n\nutils.callbacks.comet_\nComet module for trainer callbacks" + }, + { + "objectID": "docs/api/index.html#core", + "href": "docs/api/index.html#core", + "title": "API Reference", + "section": "", + "text": "Core functionality for training\n\n\n\ntrain\nPrepare and train a model on a dataset. Can also infer from a model or merge lora\n\n\nevaluate\nModule for evaluating models.\n\n\ndatasets\nModule containing Dataset functionality\n\n\nconvert\nModule containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes\n\n\nprompt_tokenizers\nModule containing PromptTokenizingStrategy and Prompter classes\n\n\nlogging_config\nCommon logging module for axolotl\n\n\ncore.trainer_builder\nBuilder for the training args and trainer\n\n\ncore.training_args\nextra axolotl specific training args\n\n\ncore.chat.messages\ninternal message representations of chat messages\n\n\ncore.chat.format.chatml\nChatML transformation functions for MessageContents\n\n\ncore.chat.format.llama3x\nLlama 3.x chat formatting functions for MessageContents\n\n\ncore.chat.format.shared\nshared functions for format transforms\n\n\ncore.datasets.chat\nchat dataset module\n\n\ncore.datasets.transforms.chat_builder\nThis module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat." 
+ }, + { + "objectID": "docs/api/index.html#cli", + "href": "docs/api/index.html#cli", + "title": "API Reference", + "section": "", + "text": "Command-line interface\n\n\n\ncli.main\nClick CLI definitions for various axolotl commands.\n\n\ncli.train\nCLI to run training on a model.\n\n\ncli.evaluate\nCLI to run evaluation on a model.\n\n\ncli.args\nModule for axolotl CLI command arguments.\n\n\ncli.checks\nVarious checks for Axolotl CLI.\n\n\ncli.config\nConfiguration loading and processing.\n\n\ncli.inference\nCLI to run inference on a trained model.\n\n\ncli.merge_lora\nCLI to merge a trained LoRA into a base model.\n\n\ncli.merge_sharded_fsdp_weights\nCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.\n\n\ncli.preprocess\nCLI to run preprocessing of a dataset.\n\n\ncli.sweeps\nUtilities for handling sweeps over configs for axolotl train CLI command\n\n\ncli.utils\nUtility methods for axolotl CLI.\n\n\ncli.cloud.base\nbase class for cloud platforms from cli\n\n\ncli.cloud.modal_\nModal Cloud support from CLI" + }, + { + "objectID": "docs/api/index.html#trainers", + "href": "docs/api/index.html#trainers", + "title": "API Reference", + "section": "", + "text": "Training implementations\n\n\n\ncore.trainers.base\nModule for customized trainers\n\n\ncore.trainers.trl\nModule for TRL PPO trainer\n\n\ncore.trainers.dpo.trainer\nDPO trainer for axolotl\n\n\ncore.trainers.grpo.trainer\nAxolotl GRPO trainer" + }, + { + "objectID": "docs/api/index.html#prompt-strategies", + "href": "docs/api/index.html#prompt-strategies", + "title": "API Reference", + "section": "", + "text": "Prompt formatting strategies\n\n\n\nprompt_strategies.base\nmodule for base dataset transform strategies\n\n\nprompt_strategies.chat_template\nHF Chat Templates prompt strategy\n\n\nprompt_strategies.alpaca_chat\nModule for Alpaca prompt strategy classes\n\n\nprompt_strategies.alpaca_instruct\nModule loading the AlpacaInstructPromptTokenizingStrategy 
class\n\n\nprompt_strategies.alpaca_w_system\nPrompt strategies loader for alpaca instruction datasets with system prompts\n\n\nprompt_strategies.user_defined\nUser Defined prompts with configuration from the YML config\n\n\nprompt_strategies.llama2_chat\nPrompt Strategy for finetuning Llama2 chat models\n\n\nprompt_strategies.completion\nBasic completion text\n\n\nprompt_strategies.input_output\nModule for plain input/output prompt pairs\n\n\nprompt_strategies.stepwise_supervised\nModule for stepwise datasets, typically including a prompt and reasoning traces,\n\n\nprompt_strategies.metharme\nModule containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class\n\n\nprompt_strategies.orcamini\nPrompt Strategy for finetuning Orca Mini (v2) models\n\n\nprompt_strategies.pygmalion\nModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class\n\n\nprompt_strategies.messages.chat\nChat dataset wrapping strategy for new internal messages representations\n\n\nprompt_strategies.dpo.chat_template\nDPO prompt strategies for using tokenizer chat templates.\n\n\nprompt_strategies.dpo.llama3\nDPO strategies for llama-3 chat template\n\n\nprompt_strategies.dpo.chatml\nDPO strategies for chatml\n\n\nprompt_strategies.dpo.zephyr\nDPO strategies for zephyr\n\n\nprompt_strategies.dpo.user_defined\nUser-defined DPO strategies\n\n\nprompt_strategies.dpo.passthrough\nDPO prompt strategies passthrough/zero-processing strategy\n\n\nprompt_strategies.kto.llama3\nKTO strategies for llama-3 chat template\n\n\nprompt_strategies.kto.chatml\nKTO strategies for chatml\n\n\nprompt_strategies.kto.user_defined\nUser-defined KTO strategies\n\n\nprompt_strategies.orpo.chat_template\nchatml prompt tokenization strategy for ORPO\n\n\nprompt_strategies.bradley_terry.llama3\nchatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template" + }, + { + "objectID": "docs/api/index.html#kernels", + "href": "docs/api/index.html#kernels", + 
"title": "API Reference", + "section": "", + "text": "Low-level performance optimizations\n\n\n\nkernels.lora\nModule for definition of Low-Rank Adaptation (LoRA) Triton kernels.\n\n\nkernels.geglu\nModule for definition of GEGLU Triton kernels.\n\n\nkernels.swiglu\nModule for definition of SwiGLU Triton kernels.\n\n\nkernels.quantize\nDequantization utilities for bitsandbytes integration.\n\n\nkernels.utils\nUtilities for axolotl.kernels submodules." + }, + { + "objectID": "docs/api/index.html#monkeypatches", + "href": "docs/api/index.html#monkeypatches", + "title": "API Reference", + "section": "", + "text": "Runtime patches for model optimizations\n\n\n\nmonkeypatch.llama_attn_hijack_flash\nFlash attention monkey patch for llama model\n\n\nmonkeypatch.llama_attn_hijack_xformers\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments\n\n\nmonkeypatch.mistral_attn_hijack_flash\nFlash attention monkey patch for mistral model\n\n\nmonkeypatch.multipack\nmultipack patching for v2 of sample packing\n\n\nmonkeypatch.relora\nImplements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune.\n\n\nmonkeypatch.llama_expand_mask\nexpands the binary attention mask per 3.2.2 of https://arxiv.org/pdf/2107.02027.pdf\n\n\nmonkeypatch.lora_kernels\nModule for patching custom LoRA Triton kernels and torch.autograd functions.\n\n\nmonkeypatch.utils\nShared utils for the monkeypatches\n\n\nmonkeypatch.btlm_attn_hijack_flash\nFlash attention monkey patch for cerebras btlm model\n\n\nmonkeypatch.llama_patch_multipack\nPatched LlamaAttention to use torch.nn.functional.scaled_dot_product_attention\n\n\nmonkeypatch.stablelm_attn_hijack_flash\nPyTorch StableLM Epoch model.\n\n\nmonkeypatch.trainer_fsdp_optim\nfix for FSDP optimizer save in trainer w 4.47.0\n\n\nmonkeypatch.transformers_fa_utils\nsee 
https://github.com/huggingface/transformers/pull/35834\n\n\nmonkeypatch.unsloth_\nmodule for patching with unsloth optimizations\n\n\nmonkeypatch.attention.mllama\nMonkeypatch for Vision Llama for FA2 support\n\n\nmonkeypatch.data.batch_dataset_fetcher\nmonkey patches for the dataset fetcher to handle batches of packed indexes\n\n\nmonkeypatch.mixtral\nPatches to support multipack for mixtral" + }, + { + "objectID": "docs/api/index.html#utils", + "href": "docs/api/index.html#utils", + "title": "API Reference", + "section": "", + "text": "Utility functions\n\n\n\nutils.models\nModule for models and model loading\n\n\nutils.tokenization\nModule for tokenization utilities\n\n\nutils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\n\n\nutils.lora\nmodule to get the state dict of a merged lora model\n\n\nutils.lora_embeddings\nhelpers for lora embeddings\n\n\nutils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\nutils.bench\nBenchmarking and measurement utilities\n\n\nutils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\nutils.trainer\nModule containing the Trainer class and related functions\n\n\nutils.schedulers\nModule for custom LRScheduler class\n\n\nutils.distributed\nutility helpers for distributed checks\n\n\nutils.dict\nModule containing the DictDefault class\n\n\nutils.optimizers.adopt\nCopied from https://github.com/iShohei220/adopt\n\n\nutils.data.pretraining\ndata handling specific to pretraining\n\n\nutils.data.sft\ndata handling specific to SFT\n\n\nutils.gradient_checkpointing.unsloth\nUnsloth checkpointing" + }, + { + "objectID": "docs/api/index.html#schemas", + "href": "docs/api/index.html#schemas", + "title": "API Reference", + "section": "", + "text": "Pydantic data models for Axolotl config\n\n\n\nutils.schemas.config\nModule with Pydantic models for configuration.\n\n\nutils.schemas.model\nPydantic models for model input / output, etc. 
configuration\n\n\nutils.schemas.training\nPydantic models for training hyperparameters\n\n\nutils.schemas.datasets\nPydantic models for datasets-related configuration\n\n\nutils.schemas.peft\nPydantic models for PEFT-related configuration\n\n\nutils.schemas.trl\nPydantic models for TRL trainer configuration\n\n\nutils.schemas.integrations\nPydantic models for Axolotl integrations\n\n\nutils.schemas.enums\nEnums for Axolotl input config\n\n\nutils.schemas.utils\nUtilities for Axolotl Pydantic models" + }, + { + "objectID": "docs/api/index.html#integrations", + "href": "docs/api/index.html#integrations", + "title": "API Reference", + "section": "", + "text": "Third-party integrations and extensions\n\n\n\nintegrations.base\nBase class for all plugins.\n\n\nintegrations.cut_cross_entropy.args\nModule for handling Cut Cross Entropy input arguments.\n\n\nintegrations.grokfast.optimizer\n\n\n\nintegrations.kd.trainer\nKD trainer\n\n\nintegrations.liger.args\nModule for handling LIGER input arguments.\n\n\nintegrations.lm_eval.args\nModule for handling lm eval harness input arguments.\n\n\nintegrations.spectrum.args\nModule for handling Spectrum input arguments." + }, + { + "objectID": "docs/api/index.html#common", + "href": "docs/api/index.html#common", + "title": "API Reference", + "section": "", + "text": "Common utilities and shared functionality\n\n\n\ncommon.architectures\nCommon architecture specific constants\n\n\ncommon.const\nVarious shared constants\n\n\ncommon.datasets\nDataset loading utilities." 
+ }, + { + "objectID": "docs/api/index.html#models", + "href": "docs/api/index.html#models", + "title": "API Reference", + "section": "", + "text": "Custom model implementations\n\n\n\nmodels.mamba.modeling_mamba" + }, + { + "objectID": "docs/api/index.html#data-processing", + "href": "docs/api/index.html#data-processing", + "title": "API Reference", + "section": "", + "text": "Data processing utilities\n\n\n\nutils.collators.core\nbasic shared collator constants\n\n\nutils.collators.batching\nData collators for axolotl to pad labels and position_ids for packed sequences. Also\n\n\nutils.collators.mamba\ncollators for Mamba\n\n\nutils.collators.mm_chat\nCollators for multi-modal chat messages and packing\n\n\nutils.samplers.multipack\nMultipack Batch Sampler" + }, + { + "objectID": "docs/api/index.html#callbacks", + "href": "docs/api/index.html#callbacks", + "title": "API Reference", + "section": "", + "text": "Training callbacks\n\n\n\nutils.callbacks.perplexity\ncallback to calculate perplexity as an evaluation metric.\n\n\nutils.callbacks.profiler\nHF Trainer callback for creating pytorch profiling snapshots\n\n\nutils.callbacks.lisa\nmodule for LISA\n\n\nutils.callbacks.mlflow_\nMLFlow module for trainer callbacks\n\n\nutils.callbacks.comet_\nComet module for trainer callbacks" + }, + { + "objectID": "docs/api/cli.cloud.modal_.html", + "href": "docs/api/cli.cloud.modal_.html", + "title": "cli.cloud.modal_", + "section": "", + "text": "cli.cloud.modal_\nModal Cloud support from CLI\n\n\n\n\n\nName\nDescription\n\n\n\n\nModalCloud\nModal Cloud implementation.\n\n\n\n\n\ncli.cloud.modal_.ModalCloud(self, config, app=None)\nModal Cloud implementation.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nrun_cmd\nRun a command inside a folder, with Modal Volume reloading before and commit on success.\n\n\n\n\n\ncli.cloud.modal_.run_cmd(cmd, run_folder, volumes=None)\nRun a command inside a folder, with Modal Volume reloading before and commit on success." 
+ }, + { + "objectID": "docs/api/cli.cloud.modal_.html#classes", + "href": "docs/api/cli.cloud.modal_.html#classes", + "title": "cli.cloud.modal_", + "section": "", + "text": "Name\nDescription\n\n\n\n\nModalCloud\nModal Cloud implementation.\n\n\n\n\n\ncli.cloud.modal_.ModalCloud(self, config, app=None)\nModal Cloud implementation." + }, + { + "objectID": "docs/api/cli.cloud.modal_.html#functions", + "href": "docs/api/cli.cloud.modal_.html#functions", + "title": "cli.cloud.modal_", + "section": "", + "text": "Name\nDescription\n\n\n\n\nrun_cmd\nRun a command inside a folder, with Modal Volume reloading before and commit on success.\n\n\n\n\n\ncli.cloud.modal_.run_cmd(cmd, run_folder, volumes=None)\nRun a command inside a folder, with Modal Volume reloading before and commit on success." + }, + { + "objectID": "docs/api/prompt_strategies.dpo.llama3.html", + "href": "docs/api/prompt_strategies.dpo.llama3.html", + "title": "prompt_strategies.dpo.llama3", + "section": "", + "text": "prompt_strategies.dpo.llama3\nDPO strategies for llama-3 chat template\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. 
https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.llama3.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations" + }, + { + "objectID": "docs/api/prompt_strategies.dpo.llama3.html#functions", + "href": "docs/api/prompt_strategies.dpo.llama3.html#functions", + "title": "prompt_strategies.dpo.llama3", + "section": "", + "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.llama3.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations" + }, + { + "objectID": "docs/api/cli.train.html", + "href": "docs/api/cli.train.html", + "title": "cli.train", + "section": "", + "text": "cli.train\nCLI to run training on a model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_train.\n\n\ndo_train\nTrains a transformers model by first loading the dataset(s) specified in the\n\n\n\n\n\ncli.train.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_train.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.train.do_train(cfg, cli_args)\nTrains a transformers 
model by first loading the dataset(s) specified in the\naxolotl config, and then calling axolotl.train.train. Also runs the plugin\nmanager’s post_train_unload once training completes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nTrainerCliArgs\nTraining-specific CLI arguments.\nrequired" + }, + { + "objectID": "docs/api/cli.train.html#functions", + "href": "docs/api/cli.train.html#functions", + "title": "cli.train", + "section": "", + "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_train.\n\n\ndo_train\nTrains a transformers model by first loading the dataset(s) specified in the\n\n\n\n\n\ncli.train.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_train.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.train.do_train(cfg, cli_args)\nTrains a transformers model by first loading the dataset(s) specified in the\naxolotl config, and then calling axolotl.train.train. 
Also runs the plugin\nmanager’s post_train_unload once training completes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nTrainerCliArgs\nTraining-specific CLI arguments.\nrequired" + }, + { + "objectID": "docs/api/core.trainer_builder.html", + "href": "docs/api/core.trainer_builder.html", + "title": "core.trainer_builder", + "section": "", + "text": "core.trainer_builder\nBuilder for the training args and trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nHFCausalTrainerBuilder\nBuild the HuggingFace training args/trainer for causal models and reward modeling\n\n\nHFPPOTrainerBuilder\nHF Factory class for PPO Trainer\n\n\nHFRLTrainerBuilder\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\nTrainerBuilderBase\nBase class for trainer builder.\n\n\n\n\n\ncore.trainer_builder.HFCausalTrainerBuilder(\n self,\n cfg,\n model,\n tokenizer,\n processor=None,\n)\nBuild the HuggingFace training args/trainer for causal models and reward modeling\nusing TRL.\n\n\n\ncore.trainer_builder.HFPPOTrainerBuilder(\n self,\n cfg,\n model,\n tokenizer,\n processor=None,\n)\nHF Factory class for PPO Trainer\n\n\n\ncore.trainer_builder.HFRLTrainerBuilder(\n self,\n cfg,\n model,\n tokenizer,\n processor=None,\n)\nTrainer factory class for TRL-based RLHF trainers (e.g. 
DPO)\n\n\n\ncore.trainer_builder.TrainerBuilderBase(\n self,\n cfg,\n model,\n tokenizer,\n processor=None,\n)\nBase class for trainer builder.\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_post_trainer_create_callbacks\nCallbacks added after the trainer is created, usually b/c these need access to the trainer\n\n\n\n\n\ncore.trainer_builder.TrainerBuilderBase.get_post_trainer_create_callbacks(\n trainer,\n)\nCallbacks added after the trainer is created, usually b/c these need access to the trainer" + }, + { + "objectID": "docs/api/core.trainer_builder.html#classes", + "href": "docs/api/core.trainer_builder.html#classes", + "title": "core.trainer_builder", + "section": "", + "text": "Name\nDescription\n\n\n\n\nHFCausalTrainerBuilder\nBuild the HuggingFace training args/trainer for causal models and reward modeling\n\n\nHFPPOTrainerBuilder\nHF Factory class for PPO Trainer\n\n\nHFRLTrainerBuilder\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\nTrainerBuilderBase\nBase class for trainer builder.\n\n\n\n\n\ncore.trainer_builder.HFCausalTrainerBuilder(\n self,\n cfg,\n model,\n tokenizer,\n processor=None,\n)\nBuild the HuggingFace training args/trainer for causal models and reward modeling\nusing TRL.\n\n\n\ncore.trainer_builder.HFPPOTrainerBuilder(\n self,\n cfg,\n model,\n tokenizer,\n processor=None,\n)\nHF Factory class for PPO Trainer\n\n\n\ncore.trainer_builder.HFRLTrainerBuilder(\n self,\n cfg,\n model,\n tokenizer,\n processor=None,\n)\nTrainer factory class for TRL-based RLHF trainers (e.g. 
DPO)\n\n\n\ncore.trainer_builder.TrainerBuilderBase(\n self,\n cfg,\n model,\n tokenizer,\n processor=None,\n)\nBase class for trainer builder.\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_post_trainer_create_callbacks\nCallbacks added after the trainer is created, usually b/c these need access to the trainer\n\n\n\n\n\ncore.trainer_builder.TrainerBuilderBase.get_post_trainer_create_callbacks(\n trainer,\n)\nCallbacks added after the trainer is created, usually b/c these need access to the trainer" + }, + { + "objectID": "docs/api/utils.callbacks.perplexity.html", + "href": "docs/api/utils.callbacks.perplexity.html", + "title": "utils.callbacks.perplexity", + "section": "", + "text": "utils.callbacks.perplexity\ncallback to calculate perplexity as an evaluation metric.\n\n\n\n\n\nName\nDescription\n\n\n\n\nPerplexity\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity(self, tokenizer, max_seq_len, stride=512)\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\nThis is a custom variant that doesn’t re-tokenize the input or re-load the model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute\nCompute perplexity in a fixed length sliding window across the sequence.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity.compute(model, references=None)\nCompute perplexity in a fixed length sliding window across the sequence." 
+ }, + { + "objectID": "docs/api/utils.callbacks.perplexity.html#classes", + "href": "docs/api/utils.callbacks.perplexity.html#classes", + "title": "utils.callbacks.perplexity", + "section": "", + "text": "Name\nDescription\n\n\n\n\nPerplexity\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity(self, tokenizer, max_seq_len, stride=512)\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\nThis is a custom variant that doesn’t re-tokenize the input or re-load the model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute\nCompute perplexity in a fixed length sliding window across the sequence.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity.compute(model, references=None)\nCompute perplexity in a fixed length sliding window across the sequence." + }, { "objectID": "docs/getting-started.html", "href": "docs/getting-started.html", @@ -357,7 +1288,7 @@ "href": "docs/getting-started.html#sec-quick-example", "title": "Quickstart", "section": "1 Quick Example", - "text": "1 Quick Example\nLet’s start by fine-tuning a small language model using LoRA. This example uses a 1B parameter model to ensure it runs on most GPUs. Assuming axolotl is installed (if not, see our Installation Guide)\n\nDownload example configs:\n\naxolotl fetch examples\n\nRun the training:\n\naxolotl train examples/llama-3/lora-1b.yml\nThat’s it! Let’s understand what just happened.", + "text": "1 Quick Example\nLet’s start by fine-tuning a small language model using LoRA. This example uses a 1B parameter model to ensure it runs on most GPUs.\nAssuming axolotl is installed (if not, see our Installation Guide)\n\nDownload example configs:\n\naxolotl fetch examples\n\nRun the training:\n\naxolotl train examples/llama-3/lora-1b.yml\nThat’s it! 
Let’s understand what just happened.", "crumbs": [ "Getting Started", "Quickstart" @@ -379,7 +1310,7 @@ "href": "docs/getting-started.html#sec-custom", "title": "Quickstart", "section": "3 Your First Custom Training", - "text": "3 Your First Custom Training\nLet’s modify the example for your own data:\n\nCreate a new config file my_training.yml:\n\nbase_model: NousResearch/Nous-Hermes-llama-1b-v1\n\nload_in_8bit: true\nadapter: lora\n\n# Training settings\nmicro_batch_size: 2\nnum_epochs: 3\nlearning_rate: 0.0003\n\n# Your dataset\ndatasets:\n - path: my_data.jsonl # Your local data file\n type: alpaca # Or other format\nThis specific config is for LoRA fine-tuning a model with instruction tuning data using the alpaca dataset format, which has the following format:\n{\n \"instruction\": \"Write a description of alpacas.\",\n \"input\": \"\",\n \"output\": \"Alpacas are domesticated South American camelids...\"\n}\nPlease see our Dataset Formats for more dataset formats and how to format them.\n\nPrepare your JSONL data in the specified format (in this case, the expected `alpaca format):\n\n{\"instruction\": \"Classify this text\", \"input\": \"I love this!\", \"output\": \"positive\"}\n{\"instruction\": \"Classify this text\", \"input\": \"Not good at all\", \"output\": \"negative\"}\n\nRun the training:\n\naxolotl train my_training.yml", + "text": "3 Your First Custom Training\nLet’s modify the example for your own data:\n\nCreate a new config file my_training.yml:\n\nbase_model: NousResearch/Nous-Hermes-llama-1b-v1\n\nload_in_8bit: true\nadapter: lora\n\n# Training settings\nmicro_batch_size: 2\nnum_epochs: 3\nlearning_rate: 0.0003\n\n# Your dataset\ndatasets:\n - path: my_data.jsonl # Your local data file\n type: alpaca # Or other format\nThis specific config is for LoRA fine-tuning a model with instruction tuning data using\nthe alpaca dataset format, which has the following format:\n{\n \"instruction\": \"Write a description of alpacas.\",\n \"input\": \"\",\n 
\"output\": \"Alpacas are domesticated South American camelids...\"\n}\nPlease see our Dataset Formats for more dataset formats and how to\nformat them.\n\nPrepare your JSONL data in the specified format (in this case, the expected `alpaca\nformat):\n\n{\"instruction\": \"Classify this text\", \"input\": \"I love this!\", \"output\": \"positive\"}\n{\"instruction\": \"Classify this text\", \"input\": \"Not good at all\", \"output\": \"negative\"}\n\nRun the training:\n\naxolotl train my_training.yml", "crumbs": [ "Getting Started", "Quickstart" @@ -478,7 +1409,7 @@ "href": "docs/multipack.html", "title": "Multipack (Sample Packing)", "section": "", - "text": "Because Flash Attention simply drops the attention mask, we do not need to construct a 4d attention mask. We only need to concatenate the sequences into a single batch and let flash attention know where each new sequence begins.\n4k context, bsz =4, each character represents 256 tokens X represents a padding token\n 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n B B B B B B ]\n C C C C C C C ]\n D D D D ]]\n\n[[ E E E E E E E E ]\n [ F F F F ]\n [ G G G ]\n [ H H H H ]]\n\n[[ I I I ]\n [ J J J ]\n [ K K K K K]\n [ L L L ]]\nafter padding to longest input in each step\n 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n B B B B B B X X X X X X ]\n C C C C C C C X X X X ]\n D D D D X X X X X X X ]]\n\n[[ E E E E E E E E ]\n [ F F F F X X X X ]\n [ G G G X X X X X ]\n [ H H H H X X X X ]]\n\n[[ I I I X X ]\n [ J J J X X ]\n [ K K K K K ]\n [ L L L X X ]]\nw packing ( note it’s the same effective number of tokens per step, but a true bsz of 1)\n 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A B B B B B\n B C C C C C C C D D D D E E E E\n E E E E F F F F F G G G H H H H\n I I I J J J J K K K K K L L L X ]]\ncu_seqlens: [[ 0, 11, 17, 24, 28, 36, 41 44, 48, 51, 55, 60, 64]]", + "text": "Because Flash Attention simply drops the attention mask, we do not need to\nconstruct a 4d 
attention mask. We only need to concatenate the sequences into\na single batch and let flash attention know where each new sequence begins.\n4k context, bsz =4,\neach character represents 256 tokens\nX represents a padding token\n 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n B B B B B B ]\n C C C C C C C ]\n D D D D ]]\n\n[[ E E E E E E E E ]\n [ F F F F ]\n [ G G G ]\n [ H H H H ]]\n\n[[ I I I ]\n [ J J J ]\n [ K K K K K]\n [ L L L ]]\nafter padding to longest input in each step\n 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n B B B B B B X X X X X X ]\n C C C C C C C X X X X ]\n D D D D X X X X X X X ]]\n\n[[ E E E E E E E E ]\n [ F F F F X X X X ]\n [ G G G X X X X X ]\n [ H H H H X X X X ]]\n\n[[ I I I X X ]\n [ J J J X X ]\n [ K K K K K ]\n [ L L L X X ]]\nw packing ( note it’s the same effective number of tokens per step, but a true bsz of 1)\n 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A B B B B B\n B C C C C C C C D D D D E E E E\n E E E E F F F F F G G G H H H H\n I I I J J J J K K K K K L L L X ]]\ncu_seqlens:\n[[ 0, 11, 17, 24, 28, 36, 41 44, 48, 51, 55, 60, 64]]", "crumbs": [ "Core Concepts", "Multipack (Sample Packing)" @@ -489,7 +1420,7 @@ "href": "docs/multipack.html#visualization-of-multipack-with-flash-attention", "title": "Multipack (Sample Packing)", "section": "", - "text": "Because Flash Attention simply drops the attention mask, we do not need to construct a 4d attention mask. 
We only need to concatenate the sequences into a single batch and let flash attention know where each new sequence begins.\n4k context, bsz =4, each character represents 256 tokens X represents a padding token\n 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n B B B B B B ]\n C C C C C C C ]\n D D D D ]]\n\n[[ E E E E E E E E ]\n [ F F F F ]\n [ G G G ]\n [ H H H H ]]\n\n[[ I I I ]\n [ J J J ]\n [ K K K K K]\n [ L L L ]]\nafter padding to longest input in each step\n 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n B B B B B B X X X X X X ]\n C C C C C C C X X X X ]\n D D D D X X X X X X X ]]\n\n[[ E E E E E E E E ]\n [ F F F F X X X X ]\n [ G G G X X X X X ]\n [ H H H H X X X X ]]\n\n[[ I I I X X ]\n [ J J J X X ]\n [ K K K K K ]\n [ L L L X X ]]\nw packing ( note it’s the same effective number of tokens per step, but a true bsz of 1)\n 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A B B B B B\n B C C C C C C C D D D D E E E E\n E E E E F F F F F G G G H H H H\n I I I J J J J K K K K K L L L X ]]\ncu_seqlens: [[ 0, 11, 17, 24, 28, 36, 41 44, 48, 51, 55, 60, 64]]", + "text": "Because Flash Attention simply drops the attention mask, we do not need to\nconstruct a 4d attention mask. 
We only need to concatenate the sequences into\na single batch and let flash attention know where each new sequence begins.\n4k context, bsz =4,\neach character represents 256 tokens\nX represents a padding token\n 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n B B B B B B ]\n C C C C C C C ]\n D D D D ]]\n\n[[ E E E E E E E E ]\n [ F F F F ]\n [ G G G ]\n [ H H H H ]]\n\n[[ I I I ]\n [ J J J ]\n [ K K K K K]\n [ L L L ]]\nafter padding to longest input in each step\n 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n B B B B B B X X X X X X ]\n C C C C C C C X X X X ]\n D D D D X X X X X X X ]]\n\n[[ E E E E E E E E ]\n [ F F F F X X X X ]\n [ G G G X X X X X ]\n [ H H H H X X X X ]]\n\n[[ I I I X X ]\n [ J J J X X ]\n [ K K K K K ]\n [ L L L X X ]]\nw packing ( note it’s the same effective number of tokens per step, but a true bsz of 1)\n 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A B B B B B\n B C C C C C C C D D D D E E E E\n E E E E F F F F F G G G H H H H\n I I I J J J J K K K K K L L L X ]]\ncu_seqlens:\n[[ 0, 11, 17, 24, 28, 36, 41 44, 48, 51, 55, 60, 64]]", "crumbs": [ "Core Concepts", "Multipack (Sample Packing)" @@ -500,235 +1431,74 @@ "href": "docs/multipack.html#multipack-without-flash-attention", "title": "Multipack (Sample Packing)", "section": "Multipack without Flash Attention", - "text": "Multipack without Flash Attention\nMultipack can still be achieved without Flash attention, but with lower packing efficiency as we are not able to join multiple batches into a single batch due to context length limits without flash attention. 
We can use either Pytorch’s Scaled Dot Product Attention implementation or native Pytorch attention implementation along with 4d attention masks to pack sequences together and avoid cross attention.", + "text": "Multipack without Flash Attention\nMultipack can still be achieved without Flash attention, but with lower packing\nefficiency as we are not able to join multiple batches into a single batch due to\ncontext length limits without flash attention. We can use either Pytorch’s Scaled\nDot Product Attention implementation or native Pytorch attention implementation\nalong with 4d attention masks\nto pack sequences together and avoid cross attention.", "crumbs": [ "Core Concepts", "Multipack (Sample Packing)" ] }, { - "objectID": "docs/debugging.html", - "href": "docs/debugging.html", - "title": "Debugging", + "objectID": "docs/sequence_parallelism.html", + "href": "docs/sequence_parallelism.html", + "title": "Sequence Parallelism", "section": "", - "text": "This document provides some tips and tricks for debugging Axolotl. It also provides an example configuration for debugging with VSCode. A good debugging setup is essential to understanding how Axolotl code works behind the scenes.", - "crumbs": [ - "Troubleshooting", - "Debugging" - ] + "text": "Sequence parallelism is a technique that splits sequences across multiple GPUs,\nallowing you to train with very long sequences that wouldn’t fit on a single GPU. 
Each\nGPU processes a different portion of the sequence, and the results are aggregated\nthrough a ring communication pattern.\n\n\nUse sequence parallelism when:\n\nYou need to train with sequence lengths that don’t fit into a single GPU’s memory\nYou have multiple GPUs available\nYou’re experiencing OOM (Out Of Memory) errors with long sequences\n\n\n\n\nTo enable sequence parallelism, add the following to your configuration file:\n# Set to a divisor (> 1) of the number of GPUs available\nsequence_parallel_degree: 4 # Split sequences across 4 GPUs\nThe sequence_parallel_degree should be a divisor of the total number of GPUs. For example:\n\nWith 8 GPUs, valid values would be 2, 4, or 8\nWith 4 GPUs, valid values would be 2 or 4\n\n\n\n\nWhen sequence parallelism is enabled:\n\nEach sequence is divided into equal chunks across the GPUs in a sequence parallel group\nThe data collator handles the chunking of input_ids, attention_mask, labels, and position_ids\nPosition IDs are adjusted to maintain proper relative positions, especially for packed sequences\nThe trainer uses special ring communication patterns for attention operations\n\n\n\n\nTo use sequence parallelism, you need:\n\nMultiple GPUs (at least 2)\nThe ring-flash-attn package. 
Install with:\n\npip install axolotl[ring-flash-attn] (preferred)\npip install ring-flash-attn>=0.1.4\n\n\n\n\n\n\nFlash attention must be enabled for this to work (flash_attention: true in config YAML)\nMay have a small performance overhead due to communication between GPUs\n\n\n\n\n# Example config with sequence parallelism\nbase_model: meta-llama/Llama-3-8B-Instruct\nsequence_len: 8192\nsequence_parallel_degree: 2 # Split each sequence into 4 parts\nflash_attention: true # Required with sequence parallelism\n...\nThis will train the Llama 3 8B model with 8K context length, with each sequence split\ninto 2 subsequences of length 4096 across 2 GPUs.\n\n\n\nSequence parallelism is compatible with Axolotl’s sample packing functionality. When using both features together:\n\nSamples are first packed together\nThe packed sequences are then divided across GPUs in the sequence parallel group\nPosition IDs are automatically adjusted to maintain proper relative positions\n\n\n\n\nWhen using sequence parallelism, your effective global batch size is divided by the sequence_parallel_degree. 
This happens because:\n\nEach group of sequence_parallel_degree GPUs works on the same batch (just different parts of each sequence)\nThe number of batches processed per step decreases\n\nFor example:\n- With 8 GPUs and no sequence parallelism: 8 different batches processed per step\n- With 8 GPUs and sequence_parallel_degree=4: Only 2 different batches processed per step (each split across 4 GPUs)\n- If your per-GPU micro_batch_size is 2, the global batch size decreases from 16 to 4" }, { - "objectID": "docs/debugging.html#table-of-contents", - "href": "docs/debugging.html#table-of-contents", - "title": "Debugging", - "section": "Table of Contents", - "text": "Table of Contents\n\nGeneral Tips\nDebugging with VSCode\n\nBackground\nConfiguration\nCustomizing your debugger\nVideo Tutorial\n\nDebugging With Docker\n\nSetup\nAttach To Container\nVideo - Attaching To Docker On Remote Host", - "crumbs": [ - "Troubleshooting", - "Debugging" - ] - }, - { - "objectID": "docs/debugging.html#general-tips", - "href": "docs/debugging.html#general-tips", - "title": "Debugging", - "section": "General Tips", - "text": "General Tips\nWhile debugging it’s helpful to simplify your test scenario as much as possible. Here are some tips for doing so:\n\n[!Important] All of these tips are incorporated into the example configuration for debugging with VSCode below.\n\n\nMake sure you are using the latest version of axolotl: This project changes often and bugs get fixed fast. Check your git branch and make sure you have pulled the latest changes from main.\nEliminate concurrency: Restrict the number of processes to 1 for both training and data preprocessing:\n\nSet CUDA_VISIBLE_DEVICES to a single GPU, ex: export CUDA_VISIBLE_DEVICES=0.\nSet dataset_processes: 1 in your axolotl config or run the training command with --dataset_processes=1.\n\nUse a small dataset: Construct or use a small dataset from HF Hub. 
When using a small dataset, you will often have to make sure sample_packing: False and eval_sample_packing: False to avoid errors. If you are in a pinch and don’t have time to construct a small dataset but want to use from the HF Hub, you can shard the data (this will still tokenize the entire dataset, but will only use a fraction of the data for training. For example, to shard the dataset into 20 pieces, add the following to your axolotl config):\ndatasets:\n ...\n shards: 20\nUse a small model: A good example of a small model is TinyLlama/TinyLlama-1.1B-Chat-v1.0.\nMinimize iteration time: Make sure the training loop finishes as fast as possible, with these settings.\n\nmicro_batch_size: 1\nmax_steps: 1\nval_set_size: 0\n\nClear Caches: Axolotl caches certain steps and so does the underlying HuggingFace trainer. You may want to clear some of these caches when debugging.\n\nData preprocessing: When debugging data preprocessing, which includes prompt template formation, you may want to delete the directory set in dataset_prepared_path: in your axolotl config. If you didn’t set this value, the default is last_run_prepared.\nHF Hub: If you are debugging data preprocessing, you should clear the relevant HF cache HuggingFace cache, by deleting the appropriate ~/.cache/huggingface/datasets/... folder(s).\nThe recommended approach is to redirect all outputs and caches to a temporary folder and delete selected subfolders before each run. This is demonstrated in the example configuration below.", - "crumbs": [ - "Troubleshooting", - "Debugging" - ] - }, - { - "objectID": "docs/debugging.html#debugging-with-vscode", - "href": "docs/debugging.html#debugging-with-vscode", - "title": "Debugging", - "section": "Debugging with VSCode", - "text": "Debugging with VSCode\n\nBackground\nThe below example shows how to configure VSCode to debug data preprocessing of the chat_template format. 
This is the format used when you have the following in your axolotl config:\ndatasets:\n - path: <path to your chat_template formatted dataset> # example on HF Hub: fozziethebeat/alpaca_messages_2k_test\n type: chat_template\n\n[!Important] If you are already familiar with advanced VSCode debugging, you can skip the below explanation and look at the files .vscode/launch.json and .vscode/tasks.json for an example configuration.\n\n\n[!Tip] If you prefer to watch a video, rather than read, you can skip to the video tutorial below (but doing both is recommended).\n\n\n\nSetup\nMake sure you have an editable install of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:\npip3 install packaging\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\nRemote Hosts\nIf you developing on a remote host, you can easily use VSCode to debug remotely. To do so, you will need to follow this remote - SSH guide. You can also see the video below on Docker and Remote SSH debugging.\n\n\n\nConfiguration\nThe easiest way to get started is to modify the .vscode/launch.json file in this project. This is just an example configuration, so you may need to modify or copy it to suit your needs.\nFor example, to mimic the command cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_chat_template.yml, you would use the below configuration1. Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to devtools and set the env variable HF_HOME to a temporary folder that is later partially deleted. 
This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.\n// .vscode/launch.json\n{\n \"version\": \"0.2.0\",\n \"configurations\": [\n {\n \"name\": \"Debug axolotl prompt - chat_template\",\n \"type\": \"python\",\n \"module\": \"accelerate.commands.launch\",\n \"request\": \"launch\",\n \"args\": [\n \"-m\", \"axolotl.cli.train\", \"dev_chat_template.yml\",\n // The flags below simplify debugging by overriding the axolotl config\n // with the debugging tips above. Modify as needed.\n \"--dataset_processes=1\", // limits data preprocessing to one process\n \"--max_steps=1\", // limits training to just one step\n \"--batch_size=1\", // minimizes batch size\n \"--micro_batch_size=1\", // minimizes batch size\n \"--val_set_size=0\", // disables validation\n \"--sample_packing=False\", // disables sample packing which is necessary for small datasets\n \"--eval_sample_packing=False\",// disables sample packing on eval set\n \"--dataset_prepared_path=temp_debug/axolotl_outputs/data\", // send data outputs to a temp folder\n \"--output_dir=temp_debug/axolotl_outputs/model\" // send model outputs to a temp folder\n ],\n \"console\": \"integratedTerminal\", // show output in the integrated terminal\n \"cwd\": \"${workspaceFolder}/devtools\", // set working directory to devtools from the root of the project\n \"justMyCode\": true, // step through only axolotl code\n \"env\": {\"CUDA_VISIBLE_DEVICES\": \"0\", // Since we aren't doing distributed training, we need to limit to one GPU\n \"HF_HOME\": \"${workspaceFolder}/devtools/temp_debug/.hf-cache\"}, // send HF cache to a temp folder\n \"preLaunchTask\": \"cleanup-for-dataprep\", // delete temp folders (see below)\n }\n ]\n}\nAdditional notes about this configuration:\n\nThe argument justMyCode is set to true such that you step through only the axolotl code. 
If you want to step into dependencies, set this to false.\nThe preLaunchTask: cleanup-for-dataprep is defined in .vscode/tasks.json and is used to delete the following folders before debugging, which is essential to ensure that the data pre-processing code is run from scratch:\n\n./devtools/temp_debug/axolotl_outputs\n./devtools/temp_debug/.hf-cache/datasets\n\n\n\n[!Tip] You may not want to delete these folders. For example, if you are debugging model training instead of data pre-processing, you may NOT want to delete the cache or output folders. You may also need to add additional tasks to the tasks.json file depending on your use case.\n\nBelow is the ./vscode/tasks.json file that defines the cleanup-for-dataprep task. This task is run before each debugging session when you use the above configuration. Note how there are two tasks that delete the two folders mentioned above. The third task cleanup-for-dataprep is a composite task that combines the two tasks. A composite task is necessary because VSCode does not allow you to specify multiple tasks in the preLaunchTask argument of the launch.json file.\n// .vscode/tasks.json\n// this file is used by launch.json\n{\n \"version\": \"2.0.0\",\n \"tasks\": [\n // this task changes into the devtools directory and deletes the temp_debug/axolotl_outputs folder\n {\n \"label\": \"delete-outputs\",\n \"type\": \"shell\",\n \"command\": \"rm -rf temp_debug/axolotl_outputs\",\n \"options\":{ \"cwd\": \"${workspaceFolder}/devtools\"},\n \"problemMatcher\": []\n },\n // this task changes into the devtools directory and deletes the `temp_debug/.hf-cache/datasets` folder\n {\n \"label\": \"delete-temp-hf-dataset-cache\",\n \"type\": \"shell\",\n \"command\": \"rm -rf temp_debug/.hf-cache/datasets\",\n \"options\":{ \"cwd\": \"${workspaceFolder}/devtools\"},\n \"problemMatcher\": []\n },\n // this task combines the two tasks above\n {\n \"label\": \"cleanup-for-dataprep\",\n \"dependsOn\": [\"delete-outputs\", 
\"delete-temp-hf-dataset-cache\"],\n }\n ]\n}\n\n\nCustomizing your debugger\nYour debugging use case may differ from the example above. The easiest thing to do is to put your own axolotl config in the devtools folder and modify the launch.json file to use your config. You may also want to modify the preLaunchTask to delete different folders or not delete anything at all.\n\n\nVideo Tutorial\nThe following video tutorial walks through the above configuration and demonstrates how to debug with VSCode, (click the image below to watch):\n\n\n\nHamel Husain’s tutorial: Debugging Axolotl w/VSCode", - "crumbs": [ - "Troubleshooting", - "Debugging" - ] - }, - { - "objectID": "docs/debugging.html#debugging-with-docker", - "href": "docs/debugging.html#debugging-with-docker", - "title": "Debugging", - "section": "Debugging With Docker", - "text": "Debugging With Docker\nUsing official Axolotl Docker images is a great way to debug your code, and is a very popular way to use Axolotl. Attaching VSCode to Docker takes a few more steps.\n\nSetup\nOn the host that is running axolotl (ex: if you are using a remote host), clone the axolotl repo and change your current directory to the root:\ngit clone https://github.com/axolotl-ai-cloud/axolotl\ncd axolotl\n\n[!Tip] If you already have axolotl cloned on your host, make sure you have the latest changes and change into the root of the project.\n\nNext, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:2\ndocker run --privileged --gpus '\"all\"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src=\"${PWD}\",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1\n\n[!Tip] To understand which containers are available, see the Docker section of the README and the DockerHub repo. 
For details of how the Docker containers are built, see axolotl’s Docker CI builds.\n\nYou will now be in the container. Next, perform an editable install of Axolotl:\npip3 install packaging\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\n\nAttach To Container\nNext, if you are using a remote host, Remote into this host with VSCode. If you are using a local host, you can skip this step.\nNext, select Dev Containers: Attach to Running Container... using the command palette (CMD + SHIFT + P) in VSCode. You will be prompted to select a container to attach to. Select the container you just created. You will now be in the container with a working directory that is at the root of the project. Any changes you make to the code will be reflected both in the container and on the host.\nNow you are ready to debug as described above (see Debugging with VSCode).\n\n\nVideo - Attaching To Docker On Remote Host\nHere is a short video that demonstrates how to attach to a Docker container on a remote host:\n\n\n\nHamel Husain’s tutorial: Debugging Axolotl Part 2: Attaching to Docker on a Remote Host", - "crumbs": [ - "Troubleshooting", - "Debugging" - ] - }, - { - "objectID": "docs/debugging.html#footnotes", - "href": "docs/debugging.html#footnotes", - "title": "Debugging", - "section": "Footnotes", - "text": "Footnotes\n\n\nThe config actually mimics the command CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/chat_template.yml, but this is the same thing.↩︎\nMany of the below flags are recommended best practices by Nvidia when using nvidia-container-toolkit. 
You can read more about these flags here.↩︎", - "crumbs": [ - "Troubleshooting", - "Debugging" - ] - }, - { - "objectID": "docs/lr_groups.html", - "href": "docs/lr_groups.html", - "title": "Learning Rate Groups", + "objectID": "docs/sequence_parallelism.html#when-to-use-sequence-parallelism", + "href": "docs/sequence_parallelism.html#when-to-use-sequence-parallelism", + "title": "Sequence Parallelism", "section": "", - "text": "Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of modules in a model.", - "crumbs": [ - "How To Guides", - "Learning Rate Groups" - ] + "text": "Use sequence parallelism when:\n\nYou need to train with sequence lengths that don’t fit into a single GPU’s memory\nYou have multiple GPUs available\nYou’re experiencing OOM (Out Of Memory) errors with long sequences" }, { - "objectID": "docs/lr_groups.html#background", - "href": "docs/lr_groups.html#background", - "title": "Learning Rate Groups", + "objectID": "docs/sequence_parallelism.html#configuration", + "href": "docs/sequence_parallelism.html#configuration", + "title": "Sequence Parallelism", "section": "", - "text": "Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of modules in a model.", - "crumbs": [ - "How To Guides", - "Learning Rate Groups" - ] + "text": "To enable sequence parallelism, add the following to your configuration file:\n# Set to a divisor (> 1) of the number of GPUs available\nsequence_parallel_degree: 4 # Split sequences across 4 GPUs\nThe sequence_parallel_degree should be a divisor of the total number of GPUs. 
For example:\n\nWith 8 GPUs, valid values would be 2, 4, or 8\nWith 4 GPUs, valid values would be 2 or 4" }, { - "objectID": "docs/lr_groups.html#example", - "href": "docs/lr_groups.html#example", - "title": "Learning Rate Groups", - "section": "Example", - "text": "Example\nlr_groups:\n - name: o_proj\n modules:\n - self_attn.o_proj.weight\n lr: 1e-6\n - name: q_proj\n modules:\n - model.layers.2.self_attn.q_proj.weight\n lr: 1e-5\n\nlearning_rate: 2e-5\nIn this example, we have a default learning rate of 2e-5 across the entire model, but we have a separate learning rate of 1e-6 for all the self attention o_proj modules across all layers, and a learning are of 1e-5 to the 3rd layer’s self attention q_proj module.", - "crumbs": [ - "How To Guides", - "Learning Rate Groups" - ] - }, - { - "objectID": "TODO.html", - "href": "TODO.html", - "title": "todo list", + "objectID": "docs/sequence_parallelism.html#implementation-details", + "href": "docs/sequence_parallelism.html#implementation-details", + "title": "Sequence Parallelism", "section": "", - "text": "[] Validation of parameters for combinations that won’t work\n\n\n\n\nFSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203\nadamw_bnb_8bit doesn’t play well with FSDP offload" + "text": "When sequence parallelism is enabled:\n\nEach sequence is divided into equal chunks across the GPUs in a sequence parallel group\nThe data collator handles the chunking of input_ids, attention_mask, labels, and position_ids\nPosition IDs are adjusted to maintain proper relative positions, especially for packed sequences\nThe trainer uses special ring communication patterns for attention operations" }, { - "objectID": "TODO.html#things-that-are-known-not-to-work", - "href": "TODO.html#things-that-are-known-not-to-work", - "title": "todo list", + "objectID": "docs/sequence_parallelism.html#requirements", + "href": "docs/sequence_parallelism.html#requirements", + "title": "Sequence Parallelism", 
"section": "", - "text": "FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203\nadamw_bnb_8bit doesn’t play well with FSDP offload" + "text": "To use sequence parallelism, you need:\n\nMultiple GPUs (at least 2)\nThe ring-flash-attn package. Install with:\n\npip install axolotl[ring-flash-attn] (preferred)\npip install ring-flash-attn>=0.1.4" }, { - "objectID": "src/axolotl/integrations/LICENSE.html", - "href": "src/axolotl/integrations/LICENSE.html", - "title": "Axolotl", + "objectID": "docs/sequence_parallelism.html#limitations", + "href": "docs/sequence_parallelism.html#limitations", + "title": "Sequence Parallelism", "section": "", - "text": "AXOLOTL COMMUNITY LICENSE AGREEMENT\nThis Axolotl Community License Agreement (“Agreement”) is entered into by and between Axolotl AI Corp. (“Axolotl”) and any individual or entity (“Licensee”) who wishes to use the Software (as defined below) in accordance with the terms and conditions set forth in this Agreement.\n\nDefinitions 1.1 “Licensee” refers to any individual or entity who has obtained a copy of the Software under this Agreement. 1.2 “Plugin Integration” means independent integration software modules which may or may not be offered by Axolotl, which may be licensed separately by their respective authors and/or licensors. 1.3 “Software” refers to the specific sub-directory of the Axolotl, Inc. software located at https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations and its subdirectories which permits Plugin Integrations to integrate with the Axolotl service.\nGrant of License 2.1 Axolotl hereby grants Licensee a worldwide, non-exclusive, royalty-free, license to use, copy, modify, merge, publish, distribute, sublicense, and/or otherwise exploit the Software, subject to the following conditions: - Licensee must comply with all the terms and conditions of this Agreement. 
- Licensee must include the original copyright notice and disclaimer of warranty in all copies or substantial portions of the Software. 2.2 Licensee may use the Software for any lawful purpose, except as restricted in Section 3.\nRestrictions 3.1 Licensee shall not use the Software for any activity that constitutes a commercial activity of offering for free or for sale any services, platform, or equivalent to third parties for the purposes of allowing such third parties to fine-tune artificial intelligence models. 3.2 Licensee shall not: - Use the Software for any illegal or unauthorized purpose. - Reverse engineer, decompile, or disassemble the Software. - Remove or modify any copyright, trademark, or other proprietary notices contained in the Software. - Use the Software in a way that could damage, disable, overburden, or impair the functionality of the Software or interfere with any third-party use of the Software. 3.3 Axolotl reserves the right to restrict certain Plugin Integrations for use with the Software. To the extent Licensee integrates a permitted, applicable Plugin Integration with the Software, Licensee shall comply with any additional terms and conditions imposed by the licensors of such Plugin Integration for use of such Plugin Integrations. Licensee shall contact Axolotl if it has questions about whether its use of the Software falls beyond the scope of this Agreement.\nIntellectual Property Rights 4.1 Axolotl and its contributors retain all intellectual property rights in and to the Software. Licensee acknowledges that this Agreement does not transfer any ownership rights or intellectual property rights to Licensee.\nDisclaimer of Warranty 5.1 THE SOFTWARE IS PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\nTermination 6.1 Axolotl may terminate this Agreement at any time if Licensee fails to comply with any of the terms and conditions set forth herein. Upon termination, Licensee shall cease all use of the Software and destroy any copies in its possession.\nGoverning Law 7.1 This Agreement shall be governed by and construed in accordance with the laws of the State of California, without regards to conflicts of laws provisions thereof.\nEntire Agreement 8.1 This Agreement constitutes the entire agreement between Axolotl and Licensee with respect to the subject matter hereof and supersedes all prior or contemporaneous understandings or agreements between the parties concerning the Software, whether written or oral. Axolotl may update the terms of this Agreement from time to time, and Licensee’s continued use of the Software after any such updates shall constitute acceptance of updated terms on a go-forward basis. Axolotl will use commercially reasonable efforts to provide Licensee notice of any material updates. By using the Software, Licensee acknowledges that it has read, understood, and agrees to be bound by the terms and conditions of this Agreement.\n\nThis Agreement was last updated on August 23, 2024." + "text": "Flash attention must be enabled for this to work (flash_attention: true in config YAML)\nMay have a small performance overhead due to communication between GPUs" }, { - "objectID": "index.html", - "href": "index.html", - "title": "Axolotl", + "objectID": "docs/sequence_parallelism.html#example", + "href": "docs/sequence_parallelism.html#example", + "title": "Sequence Parallelism", "section": "", - "text": "Axolotl is a tool designed to streamline post-training for various AI models. 
Post-training refers to any modifications or additional training performed on pre-trained models - including full model fine-tuning, parameter-efficient tuning (like LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment techniques. With support for multiple model architectures and training configurations, Axolotl makes it easy to get started with these techniques.\nAxolotl is designed to work with YAML config files that contain everything you need to preprocess a dataset, train or fine-tune a model, run model inference or evaluation, and much more.\nFeatures:", - "crumbs": [ - "Home" - ] + "text": "# Example config with sequence parallelism\nbase_model: meta-llama/Llama-3-8B-Instruct\nsequence_len: 8192\nsequence_parallel_degree: 2 # Split each sequence into 4 parts\nflash_attention: true # Required with sequence parallelism\n...\nThis will train the Llama 3 8B model with 8K context length, with each sequence split\ninto 2 subsequences of length 4096 across 2 GPUs." }, { - "objectID": "index.html#quick-start", - "href": "index.html#quick-start", - "title": "Axolotl", - "section": "🚀 Quick Start", - "text": "🚀 Quick Start\nRequirements:\n\nNVIDIA GPU (Ampere or newer for bf16 and Flash Attention) or AMD GPU\nPython 3.11\nPyTorch ≥2.4.1\n\n\nInstallation\npip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja\npip3 install --no-build-isolation axolotl[flash-attn,deepspeed]\n\n# Download example axolotl configs, deepspeed configs\naxolotl fetch examples\naxolotl fetch deepspeed_configs # OPTIONAL\nOther installation approaches are described here.\n\n\nYour First Fine-tune\n# Fetch axolotl examples\naxolotl fetch examples\n\n# Or, specify a custom path\naxolotl fetch examples --dest path/to/folder\n\n# Train a model using LoRA\naxolotl train examples/llama-3/lora-1b.yml\nThat’s it! 
Check out our Getting Started Guide for a more detailed walkthrough.", - "crumbs": [ - "Home" - ] - }, - { - "objectID": "index.html#key-features", - "href": "index.html#key-features", - "title": "Axolotl", - "section": "✨ Key Features", - "text": "✨ Key Features\n\nMultiple Model Support: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more\nTraining Methods: Full fine-tuning, LoRA, QLoRA, and more\nEasy Configuration: Simple YAML files to control your training setup\nPerformance Optimizations: Flash Attention, xformers, multi-GPU training\nFlexible Dataset Handling: Use various formats and custom datasets\nCloud Ready: Run on cloud platforms or local hardware", - "crumbs": [ - "Home" - ] - }, - { - "objectID": "index.html#documentation", - "href": "index.html#documentation", - "title": "Axolotl", - "section": "📚 Documentation", - "text": "📚 Documentation\n\nInstallation Options - Detailed setup instructions for different environments\nConfiguration Guide - Full configuration options and examples\nDataset Guide - Supported formats and how to use them\nMulti-GPU Training\nMulti-Node Training\nMultipacking\nFAQ - Frequently asked questions", - "crumbs": [ - "Home" - ] - }, - { - "objectID": "index.html#getting-help", - "href": "index.html#getting-help", - "title": "Axolotl", - "section": "🤝 Getting Help", - "text": "🤝 Getting Help\n\nJoin our Discord community for support\nCheck out our Examples directory\nRead our Debugging Guide\nNeed dedicated support? Please contact ✉️wing@axolotl.ai for options", - "crumbs": [ - "Home" - ] - }, - { - "objectID": "index.html#contributing", - "href": "index.html#contributing", - "title": "Axolotl", - "section": "🌟 Contributing", - "text": "🌟 Contributing\nContributions are welcome! 
Please see our Contributing Guide for details.", - "crumbs": [ - "Home" - ] - }, - { - "objectID": "index.html#supported-models", - "href": "index.html#supported-models", - "title": "Axolotl", - "section": "Supported Models", - "text": "Supported Models\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nfp16/fp32\nlora\nqlora\ngptq\ngptq w/flash attn\nflash attn\nxformers attn\n\n\n\n\nllama\n✅\n✅\n✅\n✅\n✅\n✅\n✅\n\n\nMistral\n✅\n✅\n✅\n✅\n✅\n✅\n✅\n\n\nMixtral-MoE\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nMixtral8X22\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nPythia\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\ncerebras\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\nbtlm\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\nmpt\n✅\n❌\n❓\n❌\n❌\n❌\n❓\n\n\nfalcon\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\ngpt-j\n✅\n✅\n✅\n❌\n❌\n❓\n❓\n\n\nXGen\n✅\n❓\n✅\n❓\n❓\n❓\n✅\n\n\nphi\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nRWKV\n✅\n❓\n❓\n❓\n❓\n❓\n❓\n\n\nQwen\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nGemma\n✅\n✅\n✅\n❓\n❓\n✅\n❓\n\n\nJamba\n✅\n✅\n✅\n❓\n❓\n✅\n❓\n\n\n\n✅: supported ❌: not supported ❓: untested", - "crumbs": [ - "Home" - ] - }, - { - "objectID": "index.html#sponsors", - "href": "index.html#sponsors", - "title": "Axolotl", - "section": "❤️ Sponsors", - "text": "❤️ Sponsors\nThank you to our sponsors who help make Axolotl possible:\n\nModal - Modal lets you run jobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale, fine-tune large language models, run protein folding simulations, and much more.\n\nInterested in sponsoring? 
Contact us at wing@axolotl.ai", - "crumbs": [ - "Home" - ] - }, - { - "objectID": "index.html#license", - "href": "index.html#license", - "title": "Axolotl", - "section": "📜 License", - "text": "📜 License\nThis project is licensed under the Apache 2.0 License - see the LICENSE file for details.", - "crumbs": [ - "Home" - ] - }, - { - "objectID": "src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html", - "href": "src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html", - "title": "Axolotl", + "objectID": "docs/sequence_parallelism.html#sample-packing-with-sequence-parallelism", + "href": "docs/sequence_parallelism.html#sample-packing-with-sequence-parallelism", + "title": "Sequence Parallelism", "section": "", - "text": "Acknowledgements\nPortions of this Cut Cross Entropy Software may utilize the following copyrighted material, the use of which is hereby acknowledged.\n\nPyTorch\nFrom PyTorch:\n\nCopyright (c) 2016- Facebook, Inc (Adam Paszke)\nCopyright (c) 2014- Facebook, Inc (Soumith Chintala)\nCopyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)\nCopyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)\nCopyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)\nCopyright (c) 2011-2013 NYU (Clement Farabet)\nCopyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)\nCopyright (c) 2006 Idiap Research Institute (Samy Bengio)\nCopyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)\n\nFrom Caffe2:\n\nCopyright (c) 2016-present, Facebook Inc. 
All rights reserved.\n\nAll contributions by Facebook:\nCopyright (c) 2016 Facebook Inc.\n\nAll contributions by Google:\nCopyright (c) 2015 Google Inc.\nAll rights reserved.\n\nAll contributions by Yangqing Jia:\nCopyright (c) 2015 Yangqing Jia\nAll rights reserved.\n\nAll contributions by Kakao Brain:\nCopyright 2019-2020 Kakao Brain\n\nAll contributions by Cruise LLC:\nCopyright (c) 2022 Cruise LLC.\nAll rights reserved.\n\nAll contributions by Arm:\nCopyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates\n\nAll contributions from Caffe:\nCopyright(c) 2013, 2014, 2015, the respective contributors\nAll rights reserved.\n\nAll other contributions:\nCopyright(c) 2015, 2016 the respective contributors\nAll rights reserved.\n\nCaffe2 uses a copyright model similar to Caffe: each contributor holds\ncopyright over their contributions to Caffe2. The project versioning records\nall such contribution and copyright details. If a contributor wants to further\nmark their specific copyright on a particular contribution, they should\nindicate their copyright solely in the commit message of the change when it is\ncommitted.\n\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n\n1. Redistributions of source code must retain the above copyright\nnotice, this list of conditions and the following disclaimer.\n\n2. Redistributions in binary form must reproduce the above copyright\nnotice, this list of conditions and the following disclaimer in the\ndocumentation and/or other materials provided with the distribution.\n\n3. 
Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America\nand IDIAP Research Institute nor the names of its contributors may be\nused to endorse or promote products derived from this software without\nspecific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\nARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\nLIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\nCONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\nSUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\nINTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\nCONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\nARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\nPOSSIBILITY OF SUCH DAMAGE.\nTriton\n/*\n* Copyright 2018-2020 Philippe Tillet\n* Copyright 2020-2022 OpenAI\n*\n* Permission is hereby granted, free of charge, to any person obtaining\n* a copy of this software and associated documentation files\n* (the \"Software\"), to deal in the Software without restriction,\n* including without limitation the rights to use, copy, modify, merge,\n* publish, distribute, sublicense, and/or sell copies of the Software,\n* and to permit persons to whom the Software is furnished to do so,\n* subject to the following conditions:\n*\n* The above copyright notice and this permission notice shall be\n* included in all copies or substantial portions of the Software.\n*\n* THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\n* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n* 
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n*/\nTransformers\nCopyright 2018- The Hugging Face team. All rights reserved.\n\n Apache License\n Version 2.0, January 2004\n http://www.apache.org/licenses/\n\nTERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n1. Definitions.\n\n \"License\" shall mean the terms and conditions for use, reproduction,\n and distribution as defined by Sections 1 through 9 of this document.\n\n \"Licensor\" shall mean the copyright owner or entity authorized by\n the copyright owner that is granting the License.\n\n \"Legal Entity\" shall mean the union of the acting entity and all\n other entities that control, are controlled by, or are under common\n control with that entity. For the purposes of this definition,\n \"control\" means (i) the power, direct or indirect, to cause the\n direction or management of such entity, whether by contract or\n otherwise, or (ii) ownership of fifty percent (50%) or more of the\n outstanding shares, or (iii) beneficial ownership of such entity.\n\n \"You\" (or \"Your\") shall mean an individual or Legal Entity\n exercising permissions granted by this License.\n\n \"Source\" form shall mean the preferred form for making modifications,\n including but not limited to software source code, documentation\n source, and configuration files.\n\n \"Object\" form shall mean any form resulting from mechanical\n transformation or translation of a Source form, including but\n not limited to compiled object code, generated documentation,\n and conversions to other media types.\n\n \"Work\" shall mean the work of authorship, whether in Source or\n Object form, made available under the License, as indicated by a\n copyright notice that is included in or attached to the work\n (an example is provided in the Appendix below).\n\n \"Derivative Works\" shall mean 
any work, whether in Source or Object\n form, that is based on (or derived from) the Work and for which the\n editorial revisions, annotations, elaborations, or other modifications\n represent, as a whole, an original work of authorship. For the purposes\n of this License, Derivative Works shall not include works that remain\n separable from, or merely link (or bind by name) to the interfaces of,\n the Work and Derivative Works thereof.\n\n \"Contribution\" shall mean any work of authorship, including\n the original version of the Work and any modifications or additions\n to that Work or Derivative Works thereof, that is intentionally\n submitted to Licensor for inclusion in the Work by the copyright owner\n or by an individual or Legal Entity authorized to submit on behalf of\n the copyright owner. For the purposes of this definition, \"submitted\"\n means any form of electronic, verbal, or written communication sent\n to the Licensor or its representatives, including but not limited to\n communication on electronic mailing lists, source code control systems,\n and issue tracking systems that are managed by, or on behalf of, the\n Licensor for the purpose of discussing and improving the Work, but\n excluding communication that is conspicuously marked or otherwise\n designated in writing by the copyright owner as \"Not a Contribution.\"\n\n \"Contributor\" shall mean Licensor and any individual or Legal Entity\n on behalf of whom a Contribution has been received by Licensor and\n subsequently incorporated within the Work.\n\n2. Grant of Copyright License. Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n copyright license to reproduce, prepare Derivative Works of,\n publicly display, publicly perform, sublicense, and distribute the\n Work and such Derivative Works in Source or Object form.\n\n3. Grant of Patent License. 
Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n (except as stated in this section) patent license to make, have made,\n use, offer to sell, sell, import, and otherwise transfer the Work,\n where such license applies only to those patent claims licensable\n by such Contributor that are necessarily infringed by their\n Contribution(s) alone or by combination of their Contribution(s)\n with the Work to which such Contribution(s) was submitted. If You\n institute patent litigation against any entity (including a\n cross-claim or counterclaim in a lawsuit) alleging that the Work\n or a Contribution incorporated within the Work constitutes direct\n or contributory patent infringement, then any patent licenses\n granted to You under this License for that Work shall terminate\n as of the date such litigation is filed.\n\n4. Redistribution. You may reproduce and distribute copies of the\n Work or Derivative Works thereof in any medium, with or without\n modifications, and in Source or Object form, provided that You\n meet the following conditions:\n\n (a) You must give any other recipients of the Work or\n Derivative Works a copy of this License; and\n\n (b) You must cause any modified files to carry prominent notices\n stating that You changed the files; and\n\n (c) You must retain, in the Source form of any Derivative Works\n that You distribute, all copyright, patent, trademark, and\n attribution notices from the Source form of the Work,\n excluding those notices that do not pertain to any part of\n the Derivative Works; and\n\n (d) If the Work includes a \"NOTICE\" text file as part of its\n distribution, then any Derivative Works that You distribute must\n include a readable copy of the attribution notices contained\n within such NOTICE file, excluding those notices that do not\n pertain to any part of the Derivative Works, in at least one\n of 
the following places: within a NOTICE text file distributed\n as part of the Derivative Works; within the Source form or\n documentation, if provided along with the Derivative Works; or,\n within a display generated by the Derivative Works, if and\n wherever such third-party notices normally appear. The contents\n of the NOTICE file are for informational purposes only and\n do not modify the License. You may add Your own attribution\n notices within Derivative Works that You distribute, alongside\n or as an addendum to the NOTICE text from the Work, provided\n that such additional attribution notices cannot be construed\n as modifying the License.\n\n You may add Your own copyright statement to Your modifications and\n may provide additional or different license terms and conditions\n for use, reproduction, or distribution of Your modifications, or\n for any such Derivative Works as a whole, provided Your use,\n reproduction, and distribution of the Work otherwise complies with\n the conditions stated in this License.\n\n5. Submission of Contributions. Unless You explicitly state otherwise,\n any Contribution intentionally submitted for inclusion in the Work\n by You to the Licensor shall be under the terms and conditions of\n this License, without any additional terms or conditions.\n Notwithstanding the above, nothing herein shall supersede or modify\n the terms of any separate license agreement you may have executed\n with Licensor regarding such Contributions.\n\n6. Trademarks. This License does not grant permission to use the trade\n names, trademarks, service marks, or product names of the Licensor,\n except as required for reasonable and customary use in describing the\n origin of the Work and reproducing the content of the NOTICE file.\n\n7. Disclaimer of Warranty. 
Unless required by applicable law or\n agreed to in writing, Licensor provides the Work (and each\n Contributor provides its Contributions) on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n implied, including, without limitation, any warranties or conditions\n of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n PARTICULAR PURPOSE. You are solely responsible for determining the\n appropriateness of using or redistributing the Work and assume any\n risks associated with Your exercise of permissions under this License.\n\n8. Limitation of Liability. In no event and under no legal theory,\n whether in tort (including negligence), contract, or otherwise,\n unless required by applicable law (such as deliberate and grossly\n negligent acts) or agreed to in writing, shall any Contributor be\n liable to You for damages, including any direct, indirect, special,\n incidental, or consequential damages of any character arising as a\n result of this License or out of the use or inability to use the\n Work (including but not limited to damages for loss of goodwill,\n work stoppage, computer failure or malfunction, or any and all\n other commercial damages or losses), even if such Contributor\n has been advised of the possibility of such damages.\n\n9. Accepting Warranty or Additional Liability. While redistributing\n the Work or Derivative Works thereof, You may choose to offer,\n and charge a fee for, acceptance of support, warranty, indemnity,\n or other liability obligations and/or rights consistent with this\n License. 
However, in accepting such obligations, You may act only\n on Your own behalf and on Your sole responsibility, not on behalf\n of any other Contributor, and only if You agree to indemnify,\n defend, and hold each Contributor harmless for any liability\n incurred by, or claims asserted against, such Contributor by reason\n of your accepting any such warranty or additional liability.\n\nEND OF TERMS AND CONDITIONS\n\nAPPENDIX: How to apply the Apache License to your work.\n\n To apply the Apache License to your work, attach the following\n boilerplate notice, with the fields enclosed by brackets \"[]\"\n replaced with your own identifying information. (Don't include\n the brackets!) The text should be enclosed in the appropriate\n comment syntax for the file format. We also recommend that a\n file or class name and description of purpose be included on the\n same \"printed page\" as the copyright notice for easier\n identification within third-party archives.\n\nCopyright [yyyy] [name of copyright owner]\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License." + "text": "Sequence parallelism is compatible with Axolotl’s sample packing functionality. 
When using both features together:\n\nSamples are first packed together\nThe packed sequences are then divided across GPUs in the sequence parallel group\nPosition IDs are automatically adjusted to maintain proper relative positions" }, { - "objectID": "FAQS.html", - "href": "FAQS.html", - "title": "FAQs", + "objectID": "docs/sequence_parallelism.html#effect-on-batch-size", + "href": "docs/sequence_parallelism.html#effect-on-batch-size", + "title": "Sequence Parallelism", "section": "", - "text": "FAQs\n\nCan you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this PR\nWill this work with Deepspeed? That’s still a WIP, but setting export ACCELERATE_USE_DEEPSPEED=true should work in some cases\nError invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c /arrow/cpp/src/arrow/filesystem/s3fs.cc:2598: arrow::fs::FinalizeS3 was not called even though S3 was initialized. This could lead to a segmentation fault at exit. Try reinstalling bitsandbytes and transformers from source." + "text": "When using sequence parallelism, your effective global batch size is divided by the sequence_parallel_degree. This happens because:\n\nEach group of sequence_parallel_degree GPUs works on the same batch (just different parts of each sequence)\nThe number of batches processed per step decreases\n\nFor example:\n- With 8 GPUs and no sequence parallelism: 8 different batches processed per step\n- With 8 GPUs and sequence_parallel_degree=4: Only 2 different batches processed per step (each split across 4 GPUs)\n- If your per-GPU micro_batch_size is 2, the global batch size decreases from 16 to 4" }, { "objectID": "docs/multi-node.html", @@ -774,12 +1544,236 @@ "Multi Node" ] }, + { + "objectID": "FAQS.html", + "href": "FAQS.html", + "title": "FAQs", + "section": "", + "text": "FAQs\n\nCan you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! 
Just waiting on this PR\nWill this work with Deepspeed? That’s still a WIP, but setting export ACCELERATE_USE_DEEPSPEED=true should work in some cases\nError invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c\n/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598: arrow::fs::FinalizeS3 was not called even though S3 was initialized.\nThis could lead to a segmentation fault at exit. Try reinstalling bitsandbytes and transformers from source." + }, + { + "objectID": "src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html", + "href": "src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html", + "title": "Axolotl", + "section": "", + "text": "Acknowledgements\nPortions of this Cut Cross Entropy Software may utilize the following copyrighted\nmaterial, the use of which is hereby acknowledged.\n\nPyTorch\nFrom PyTorch:\n\nCopyright (c) 2016- Facebook, Inc (Adam Paszke)\nCopyright (c) 2014- Facebook, Inc (Soumith Chintala)\nCopyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)\nCopyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)\nCopyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)\nCopyright (c) 2011-2013 NYU (Clement Farabet)\nCopyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)\nCopyright (c) 2006 Idiap Research Institute (Samy Bengio)\nCopyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)\n\nFrom Caffe2:\n\nCopyright (c) 2016-present, Facebook Inc. 
All rights reserved.\n\nAll contributions by Facebook:\nCopyright (c) 2016 Facebook Inc.\n\nAll contributions by Google:\nCopyright (c) 2015 Google Inc.\nAll rights reserved.\n\nAll contributions by Yangqing Jia:\nCopyright (c) 2015 Yangqing Jia\nAll rights reserved.\n\nAll contributions by Kakao Brain:\nCopyright 2019-2020 Kakao Brain\n\nAll contributions by Cruise LLC:\nCopyright (c) 2022 Cruise LLC.\nAll rights reserved.\n\nAll contributions by Arm:\nCopyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates\n\nAll contributions from Caffe:\nCopyright(c) 2013, 2014, 2015, the respective contributors\nAll rights reserved.\n\nAll other contributions:\nCopyright(c) 2015, 2016 the respective contributors\nAll rights reserved.\n\nCaffe2 uses a copyright model similar to Caffe: each contributor holds\ncopyright over their contributions to Caffe2. The project versioning records\nall such contribution and copyright details. If a contributor wants to further\nmark their specific copyright on a particular contribution, they should\nindicate their copyright solely in the commit message of the change when it is\ncommitted.\n\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n\n1. Redistributions of source code must retain the above copyright\nnotice, this list of conditions and the following disclaimer.\n\n2. Redistributions in binary form must reproduce the above copyright\nnotice, this list of conditions and the following disclaimer in the\ndocumentation and/or other materials provided with the distribution.\n\n3. 
Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America\nand IDIAP Research Institute nor the names of its contributors may be\nused to endorse or promote products derived from this software without\nspecific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\nARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\nLIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\nCONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\nSUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\nINTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\nCONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\nARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\nPOSSIBILITY OF SUCH DAMAGE.\nTriton\n/*\n* Copyright 2018-2020 Philippe Tillet\n* Copyright 2020-2022 OpenAI\n*\n* Permission is hereby granted, free of charge, to any person obtaining\n* a copy of this software and associated documentation files\n* (the \"Software\"), to deal in the Software without restriction,\n* including without limitation the rights to use, copy, modify, merge,\n* publish, distribute, sublicense, and/or sell copies of the Software,\n* and to permit persons to whom the Software is furnished to do so,\n* subject to the following conditions:\n*\n* The above copyright notice and this permission notice shall be\n* included in all copies or substantial portions of the Software.\n*\n* THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\n* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n* 
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n*/\nTransformers\nCopyright 2018- The Hugging Face team. All rights reserved.\n\n Apache License\n Version 2.0, January 2004\n http://www.apache.org/licenses/\n\nTERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n1. Definitions.\n\n \"License\" shall mean the terms and conditions for use, reproduction,\n and distribution as defined by Sections 1 through 9 of this document.\n\n \"Licensor\" shall mean the copyright owner or entity authorized by\n the copyright owner that is granting the License.\n\n \"Legal Entity\" shall mean the union of the acting entity and all\n other entities that control, are controlled by, or are under common\n control with that entity. For the purposes of this definition,\n \"control\" means (i) the power, direct or indirect, to cause the\n direction or management of such entity, whether by contract or\n otherwise, or (ii) ownership of fifty percent (50%) or more of the\n outstanding shares, or (iii) beneficial ownership of such entity.\n\n \"You\" (or \"Your\") shall mean an individual or Legal Entity\n exercising permissions granted by this License.\n\n \"Source\" form shall mean the preferred form for making modifications,\n including but not limited to software source code, documentation\n source, and configuration files.\n\n \"Object\" form shall mean any form resulting from mechanical\n transformation or translation of a Source form, including but\n not limited to compiled object code, generated documentation,\n and conversions to other media types.\n\n \"Work\" shall mean the work of authorship, whether in Source or\n Object form, made available under the License, as indicated by a\n copyright notice that is included in or attached to the work\n (an example is provided in the Appendix below).\n\n \"Derivative Works\" shall mean 
any work, whether in Source or Object\n form, that is based on (or derived from) the Work and for which the\n editorial revisions, annotations, elaborations, or other modifications\n represent, as a whole, an original work of authorship. For the purposes\n of this License, Derivative Works shall not include works that remain\n separable from, or merely link (or bind by name) to the interfaces of,\n the Work and Derivative Works thereof.\n\n \"Contribution\" shall mean any work of authorship, including\n the original version of the Work and any modifications or additions\n to that Work or Derivative Works thereof, that is intentionally\n submitted to Licensor for inclusion in the Work by the copyright owner\n or by an individual or Legal Entity authorized to submit on behalf of\n the copyright owner. For the purposes of this definition, \"submitted\"\n means any form of electronic, verbal, or written communication sent\n to the Licensor or its representatives, including but not limited to\n communication on electronic mailing lists, source code control systems,\n and issue tracking systems that are managed by, or on behalf of, the\n Licensor for the purpose of discussing and improving the Work, but\n excluding communication that is conspicuously marked or otherwise\n designated in writing by the copyright owner as \"Not a Contribution.\"\n\n \"Contributor\" shall mean Licensor and any individual or Legal Entity\n on behalf of whom a Contribution has been received by Licensor and\n subsequently incorporated within the Work.\n\n2. Grant of Copyright License. Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n copyright license to reproduce, prepare Derivative Works of,\n publicly display, publicly perform, sublicense, and distribute the\n Work and such Derivative Works in Source or Object form.\n\n3. Grant of Patent License. 
Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n (except as stated in this section) patent license to make, have made,\n use, offer to sell, sell, import, and otherwise transfer the Work,\n where such license applies only to those patent claims licensable\n by such Contributor that are necessarily infringed by their\n Contribution(s) alone or by combination of their Contribution(s)\n with the Work to which such Contribution(s) was submitted. If You\n institute patent litigation against any entity (including a\n cross-claim or counterclaim in a lawsuit) alleging that the Work\n or a Contribution incorporated within the Work constitutes direct\n or contributory patent infringement, then any patent licenses\n granted to You under this License for that Work shall terminate\n as of the date such litigation is filed.\n\n4. Redistribution. You may reproduce and distribute copies of the\n Work or Derivative Works thereof in any medium, with or without\n modifications, and in Source or Object form, provided that You\n meet the following conditions:\n\n (a) You must give any other recipients of the Work or\n Derivative Works a copy of this License; and\n\n (b) You must cause any modified files to carry prominent notices\n stating that You changed the files; and\n\n (c) You must retain, in the Source form of any Derivative Works\n that You distribute, all copyright, patent, trademark, and\n attribution notices from the Source form of the Work,\n excluding those notices that do not pertain to any part of\n the Derivative Works; and\n\n (d) If the Work includes a \"NOTICE\" text file as part of its\n distribution, then any Derivative Works that You distribute must\n include a readable copy of the attribution notices contained\n within such NOTICE file, excluding those notices that do not\n pertain to any part of the Derivative Works, in at least one\n of 
the following places: within a NOTICE text file distributed\n as part of the Derivative Works; within the Source form or\n documentation, if provided along with the Derivative Works; or,\n within a display generated by the Derivative Works, if and\n wherever such third-party notices normally appear. The contents\n of the NOTICE file are for informational purposes only and\n do not modify the License. You may add Your own attribution\n notices within Derivative Works that You distribute, alongside\n or as an addendum to the NOTICE text from the Work, provided\n that such additional attribution notices cannot be construed\n as modifying the License.\n\n You may add Your own copyright statement to Your modifications and\n may provide additional or different license terms and conditions\n for use, reproduction, or distribution of Your modifications, or\n for any such Derivative Works as a whole, provided Your use,\n reproduction, and distribution of the Work otherwise complies with\n the conditions stated in this License.\n\n5. Submission of Contributions. Unless You explicitly state otherwise,\n any Contribution intentionally submitted for inclusion in the Work\n by You to the Licensor shall be under the terms and conditions of\n this License, without any additional terms or conditions.\n Notwithstanding the above, nothing herein shall supersede or modify\n the terms of any separate license agreement you may have executed\n with Licensor regarding such Contributions.\n\n6. Trademarks. This License does not grant permission to use the trade\n names, trademarks, service marks, or product names of the Licensor,\n except as required for reasonable and customary use in describing the\n origin of the Work and reproducing the content of the NOTICE file.\n\n7. Disclaimer of Warranty. 
Unless required by applicable law or\n agreed to in writing, Licensor provides the Work (and each\n Contributor provides its Contributions) on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n implied, including, without limitation, any warranties or conditions\n of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n PARTICULAR PURPOSE. You are solely responsible for determining the\n appropriateness of using or redistributing the Work and assume any\n risks associated with Your exercise of permissions under this License.\n\n8. Limitation of Liability. In no event and under no legal theory,\n whether in tort (including negligence), contract, or otherwise,\n unless required by applicable law (such as deliberate and grossly\n negligent acts) or agreed to in writing, shall any Contributor be\n liable to You for damages, including any direct, indirect, special,\n incidental, or consequential damages of any character arising as a\n result of this License or out of the use or inability to use the\n Work (including but not limited to damages for loss of goodwill,\n work stoppage, computer failure or malfunction, or any and all\n other commercial damages or losses), even if such Contributor\n has been advised of the possibility of such damages.\n\n9. Accepting Warranty or Additional Liability. While redistributing\n the Work or Derivative Works thereof, You may choose to offer,\n and charge a fee for, acceptance of support, warranty, indemnity,\n or other liability obligations and/or rights consistent with this\n License. 
However, in accepting such obligations, You may act only\n on Your own behalf and on Your sole responsibility, not on behalf\n of any other Contributor, and only if You agree to indemnify,\n defend, and hold each Contributor harmless for any liability\n incurred by, or claims asserted against, such Contributor by reason\n of your accepting any such warranty or additional liability.\n\nEND OF TERMS AND CONDITIONS\n\nAPPENDIX: How to apply the Apache License to your work.\n\n To apply the Apache License to your work, attach the following\n boilerplate notice, with the fields enclosed by brackets \"[]\"\n replaced with your own identifying information. (Don't include\n the brackets!) The text should be enclosed in the appropriate\n comment syntax for the file format. We also recommend that a\n file or class name and description of purpose be included on the\n same \"printed page\" as the copyright notice for easier\n identification within third-party archives.\n\nCopyright [yyyy] [name of copyright owner]\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License." 
+ }, + { + "objectID": "index.html", + "href": "index.html", + "title": "Axolotl", + "section": "", + "text": "Axolotl is a tool designed to streamline post-training for various AI models.\nPost-training refers to any modifications or additional training performed on\npre-trained models - including full model fine-tuning, parameter-efficient tuning (like\nLoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment\ntechniques. With support for multiple model architectures and training configurations,\nAxolotl makes it easy to get started with these techniques.\nAxolotl is designed to work with YAML config files that contain everything you need to\npreprocess a dataset, train or fine-tune a model, run model inference or evaluation,\nand much more.\nFeatures:", + "crumbs": [ + "Home" + ] + }, + { + "objectID": "index.html#quick-start", + "href": "index.html#quick-start", + "title": "Axolotl", + "section": "🚀 Quick Start", + "text": "🚀 Quick Start\nRequirements:\n\nNVIDIA GPU (Ampere or newer for bf16 and Flash Attention) or AMD GPU\nPython 3.11\nPyTorch ≥2.4.1\n\n\nInstallation\npip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja\npip3 install --no-build-isolation axolotl[flash-attn,deepspeed]\n\n# Download example axolotl configs, deepspeed configs\naxolotl fetch examples\naxolotl fetch deepspeed_configs # OPTIONAL\nOther installation approaches are described here.\n\n\nYour First Fine-tune\n# Fetch axolotl examples\naxolotl fetch examples\n\n# Or, specify a custom path\naxolotl fetch examples --dest path/to/folder\n\n# Train a model using LoRA\naxolotl train examples/llama-3/lora-1b.yml\nThat’s it! 
Check out our Getting Started Guide for a more detailed walkthrough.", + "crumbs": [ + "Home" + ] + }, + { + "objectID": "index.html#key-features", + "href": "index.html#key-features", + "title": "Axolotl", + "section": "✨ Key Features", + "text": "✨ Key Features\n\nMultiple Model Support: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more\nTraining Methods: Full fine-tuning, LoRA, QLoRA, and more\nEasy Configuration: Simple YAML files to control your training setup\nPerformance Optimizations: Flash Attention, xformers, multi-GPU training\nFlexible Dataset Handling: Use various formats and custom datasets\nCloud Ready: Run on cloud platforms or local hardware", + "crumbs": [ + "Home" + ] + }, + { + "objectID": "index.html#documentation", + "href": "index.html#documentation", + "title": "Axolotl", + "section": "📚 Documentation", + "text": "📚 Documentation\n\nInstallation Options - Detailed setup instructions for different environments\nConfiguration Guide - Full configuration options and examples\nDataset Guide - Supported formats and how to use them\nMulti-GPU Training\nMulti-Node Training\nMultipacking\nAPI Reference - Auto-generated code documentation\nFAQ - Frequently asked questions", + "crumbs": [ + "Home" + ] + }, + { + "objectID": "index.html#getting-help", + "href": "index.html#getting-help", + "title": "Axolotl", + "section": "🤝 Getting Help", + "text": "🤝 Getting Help\n\nJoin our Discord community for support\nCheck out our Examples directory\nRead our Debugging Guide\nNeed dedicated support? Please contact ✉️wing@axolotl.ai for options", + "crumbs": [ + "Home" + ] + }, + { + "objectID": "index.html#contributing", + "href": "index.html#contributing", + "title": "Axolotl", + "section": "🌟 Contributing", + "text": "🌟 Contributing\nContributions are welcome! 
Please see our Contributing Guide for details.", + "crumbs": [ + "Home" + ] + }, + { + "objectID": "index.html#supported-models", + "href": "index.html#supported-models", + "title": "Axolotl", + "section": "Supported Models", + "text": "Supported Models\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nfp16/fp32\nlora\nqlora\ngptq\ngptq w/flash attn\nflash attn\nxformers attn\n\n\n\n\nllama\n✅\n✅\n✅\n✅\n✅\n✅\n✅\n\n\nMistral\n✅\n✅\n✅\n✅\n✅\n✅\n✅\n\n\nMixtral-MoE\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nMixtral8X22\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nPythia\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\ncerebras\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\nbtlm\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\nmpt\n✅\n❌\n❓\n❌\n❌\n❌\n❓\n\n\nfalcon\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\ngpt-j\n✅\n✅\n✅\n❌\n❌\n❓\n❓\n\n\nXGen\n✅\n❓\n✅\n❓\n❓\n❓\n✅\n\n\nphi\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nRWKV\n✅\n❓\n❓\n❓\n❓\n❓\n❓\n\n\nQwen\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nGemma\n✅\n✅\n✅\n❓\n❓\n✅\n❓\n\n\nJamba\n✅\n✅\n✅\n❓\n❓\n✅\n❓\n\n\n\n✅: supported\n❌: not supported\n❓: untested", + "crumbs": [ + "Home" + ] + }, + { + "objectID": "index.html#sponsors", + "href": "index.html#sponsors", + "title": "Axolotl", + "section": "❤️ Sponsors", + "text": "❤️ Sponsors\nThank you to our sponsors who help make Axolotl possible:\n\nModal - Modal lets you run\njobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale,\nfine-tune large language models, run protein folding simulations, and much more.\n\nInterested in sponsoring? 
Contact us at wing@axolotl.ai", + "crumbs": [ + "Home" + ] + }, + { + "objectID": "index.html#license", + "href": "index.html#license", + "title": "Axolotl", + "section": "📜 License", + "text": "📜 License\nThis project is licensed under the Apache 2.0 License - see the LICENSE file for details.", + "crumbs": [ + "Home" + ] + }, + { + "objectID": "src/axolotl/integrations/LICENSE.html", + "href": "src/axolotl/integrations/LICENSE.html", + "title": "Axolotl", + "section": "", + "text": "AXOLOTL COMMUNITY LICENSE AGREEMENT\nThis Axolotl Community License Agreement (“Agreement”) is entered into by and between Axolotl AI Corp. (“Axolotl”) and\nany individual or entity (“Licensee”) who wishes to use the Software (as defined below) in accordance with the terms\nand conditions set forth in this Agreement.\n\nDefinitions\n1.1 “Licensee” refers to any individual or entity who has obtained a copy of the Software under this Agreement.\n1.2 “Plugin Integration” means independent integration software modules which may or may not be offered by Axolotl,\nwhich may be licensed separately by their respective authors and/or licensors.\n1.3 “Software” refers to the specific sub-directory of the Axolotl, Inc. 
software located at\nhttps://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations and its subdirectories which\npermits Plugin Integrations to integrate with the Axolotl service.\nGrant of License\n2.1 Axolotl hereby grants Licensee a worldwide, non-exclusive, royalty-free, license to use, copy, modify, merge,\npublish, distribute, sublicense, and/or otherwise exploit the Software, subject to the following conditions:\n- Licensee must comply with all the terms and conditions of this Agreement.\n- Licensee must include the original copyright notice and disclaimer of warranty in all copies or substantial\nportions of the Software.\n2.2 Licensee may use the Software for any lawful purpose, except as restricted in Section 3.\nRestrictions\n3.1 Licensee shall not use the Software for any activity that constitutes a commercial activity of offering for\nfree or for sale any services, platform, or equivalent to third parties for the purposes of allowing such\nthird parties to fine-tune artificial intelligence models.\n3.2 Licensee shall not:\n- Use the Software for any illegal or unauthorized purpose.\n- Reverse engineer, decompile, or disassemble the Software.\n- Remove or modify any copyright, trademark, or other proprietary notices contained in the Software.\n- Use the Software in a way that could damage, disable, overburden, or impair the functionality of the\nSoftware or interfere with any third-party use of the Software.\n3.3 Axolotl reserves the right to restrict certain Plugin Integrations for use with the Software. To the extent Licensee integrates a permitted, applicable Plugin Integration with the Software, Licensee shall comply with any additional terms and conditions imposed by the licensors of such Plugin Integration for use of such Plugin Integrations. 
Licensee shall contact Axolotl if it has questions about whether its use of the Software falls beyond the scope of this Agreement.\nIntellectual Property Rights\n4.1 Axolotl and its contributors retain all intellectual property rights in and to the Software. Licensee\nacknowledges that this Agreement does not transfer any ownership rights or intellectual property rights to\nLicensee.\nDisclaimer of Warranty\n5.1 THE SOFTWARE IS PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED\nTO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. IN NO EVENT SHALL\nTHE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF\nCONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\nDEALINGS IN THE SOFTWARE.\nTermination\n6.1 Axolotl may terminate this Agreement at any time if Licensee fails to comply with any of the terms and\nconditions set forth herein. Upon termination, Licensee shall cease all use of the Software and destroy any\ncopies in its possession.\nGoverning Law\n7.1 This Agreement shall be governed by and construed in accordance with the laws of the State of California,\nwithout regards to conflicts of laws provisions thereof.\nEntire Agreement\n8.1 This Agreement constitutes the entire agreement between Axolotl and Licensee with respect to the subject matter\nhereof and supersedes all prior or contemporaneous understandings or agreements between the parties concerning\nthe Software, whether written or oral. Axolotl may update the terms of this Agreement from time to time, and\nLicensee’s continued use of the Software after any such updates shall constitute acceptance of updated terms\non a go-forward basis. Axolotl will use commercially reasonable efforts to provide Licensee notice of any\nmaterial updates. 
By using the Software, Licensee acknowledges that it has read, understood, and agrees to be\nbound by the terms and conditions of this Agreement.\n\nThis Agreement was last updated on August 23, 2024." + }, + { + "objectID": "TODO.html", + "href": "TODO.html", + "title": "todo list", + "section": "", + "text": "[] Validation of parameters for combinations that won’t work\n\n\n\n\nFSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203\nadamw_bnb_8bit doesn’t play well with FSDP offload" + }, + { + "objectID": "TODO.html#things-that-are-known-not-to-work", + "href": "TODO.html#things-that-are-known-not-to-work", + "title": "todo list", + "section": "", + "text": "FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203\nadamw_bnb_8bit doesn’t play well with FSDP offload" + }, + { + "objectID": "docs/lr_groups.html", + "href": "docs/lr_groups.html", + "title": "Learning Rate Groups", + "section": "", + "text": "Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of\nmodules in a model.", + "crumbs": [ + "How To Guides", + "Learning Rate Groups" + ] + }, + { + "objectID": "docs/lr_groups.html#background", + "href": "docs/lr_groups.html#background", + "title": "Learning Rate Groups", + "section": "", + "text": "Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of\nmodules in a model.", + "crumbs": [ + "How To Guides", + "Learning Rate Groups" + ] + }, + { + "objectID": "docs/lr_groups.html#example", + "href": "docs/lr_groups.html#example", + "title": "Learning Rate Groups", + "section": "Example", + "text": "Example\nlr_groups:\n - name: o_proj\n modules:\n - self_attn.o_proj.weight\n lr: 1e-6\n - name: q_proj\n modules:\n - model.layers.2.self_attn.q_proj.weight\n lr: 1e-5\n\nlearning_rate: 2e-5\nIn this example, we have a default learning rate of 2e-5 across the entire model, but we have 
a separate learning rate\nof 1e-6 for all the self attention o_proj modules across all layers, and a learning are of 1e-5 to the 3rd layer’s\nself attention q_proj module.", + "crumbs": [ + "How To Guides", + "Learning Rate Groups" + ] + }, + { + "objectID": "docs/debugging.html", + "href": "docs/debugging.html", + "title": "Debugging", + "section": "", + "text": "This document provides some tips and tricks for debugging Axolotl. It also provides an example configuration for debugging with VSCode. A good debugging setup is essential to understanding how Axolotl code works behind the scenes.", + "crumbs": [ + "Troubleshooting", + "Debugging" + ] + }, + { + "objectID": "docs/debugging.html#table-of-contents", + "href": "docs/debugging.html#table-of-contents", + "title": "Debugging", + "section": "Table of Contents", + "text": "Table of Contents\n\nGeneral Tips\nDebugging with VSCode\n\nBackground\nConfiguration\nCustomizing your debugger\nVideo Tutorial\n\nDebugging With Docker\n\nSetup\nAttach To Container\nVideo - Attaching To Docker On Remote Host", + "crumbs": [ + "Troubleshooting", + "Debugging" + ] + }, + { + "objectID": "docs/debugging.html#general-tips", + "href": "docs/debugging.html#general-tips", + "title": "Debugging", + "section": "General Tips", + "text": "General Tips\nWhile debugging it’s helpful to simplify your test scenario as much as possible. Here are some tips for doing so:\n\n[!Important]\nAll of these tips are incorporated into the example configuration for debugging with VSCode below.\n\n\nMake sure you are using the latest version of axolotl: This project changes often and bugs get fixed fast. 
Check your git branch and make sure you have pulled the latest changes from main.\nEliminate concurrency: Restrict the number of processes to 1 for both training and data preprocessing:\n\nSet CUDA_VISIBLE_DEVICES to a single GPU, ex: export CUDA_VISIBLE_DEVICES=0.\nSet dataset_processes: 1 in your axolotl config or run the training command with --dataset_processes=1.\n\nUse a small dataset: Construct or use a small dataset from HF Hub. When using a small dataset, you will often have to make sure sample_packing: False and eval_sample_packing: False to avoid errors. If you are in a pinch and don’t have time to construct a small dataset but want to use from the HF Hub, you can shard the data (this will still tokenize the entire dataset, but will only use a fraction of the data for training. For example, to shard the dataset into 20 pieces, add the following to your axolotl config):\ndatasets:\n ...\n shards: 20\nUse a small model: A good example of a small model is TinyLlama/TinyLlama-1.1B-Chat-v1.0.\nMinimize iteration time: Make sure the training loop finishes as fast as possible, with these settings.\n\nmicro_batch_size: 1\nmax_steps: 1\nval_set_size: 0\n\nClear Caches: Axolotl caches certain steps and so does the underlying HuggingFace trainer. You may want to clear some of these caches when debugging.\n\nData preprocessing: When debugging data preprocessing, which includes prompt template formation, you may want to delete the directory set in dataset_prepared_path: in your axolotl config. If you didn’t set this value, the default is last_run_prepared.\nHF Hub: If you are debugging data preprocessing, you should clear the relevant HF cache HuggingFace cache, by deleting the appropriate ~/.cache/huggingface/datasets/... folder(s).\nThe recommended approach is to redirect all outputs and caches to a temporary folder and delete selected subfolders before each run. 
This is demonstrated in the example configuration below.", + "crumbs": [ + "Troubleshooting", + "Debugging" + ] + }, + { + "objectID": "docs/debugging.html#debugging-with-vscode", + "href": "docs/debugging.html#debugging-with-vscode", + "title": "Debugging", + "section": "Debugging with VSCode", + "text": "Debugging with VSCode\n\nBackground\nThe below example shows how to configure VSCode to debug data preprocessing of the chat_template format. This is the format used when you have the following in your axolotl config:\ndatasets:\n - path: <path to your chat_template formatted dataset> # example on HF Hub: fozziethebeat/alpaca_messages_2k_test\n type: chat_template\n\n[!Important]\nIf you are already familiar with advanced VSCode debugging, you can skip the below explanation and look at the files .vscode/launch.json and .vscode/tasks.json for an example configuration.\n\n\n[!Tip]\nIf you prefer to watch a video, rather than read, you can skip to the video tutorial below (but doing both is recommended).\n\n\n\nSetup\nMake sure you have an editable install of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:\npip3 install packaging\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\nRemote Hosts\nIf you developing on a remote host, you can easily use VSCode to debug remotely. To do so, you will need to follow this remote - SSH guide. You can also see the video below on Docker and Remote SSH debugging.\n\n\n\nConfiguration\nThe easiest way to get started is to modify the .vscode/launch.json file in this project. This is just an example configuration, so you may need to modify or copy it to suit your needs.\nFor example, to mimic the command cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_chat_template.yml, you would use the below configuration1. 
Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to devtools and set the env variable HF_HOME to a temporary folder that is later partially deleted. This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.\n// .vscode/launch.json\n{\n \"version\": \"0.2.0\",\n \"configurations\": [\n {\n \"name\": \"Debug axolotl prompt - chat_template\",\n \"type\": \"python\",\n \"module\": \"accelerate.commands.launch\",\n \"request\": \"launch\",\n \"args\": [\n \"-m\", \"axolotl.cli.train\", \"dev_chat_template.yml\",\n // The flags below simplify debugging by overriding the axolotl config\n // with the debugging tips above. Modify as needed.\n \"--dataset_processes=1\", // limits data preprocessing to one process\n \"--max_steps=1\", // limits training to just one step\n \"--batch_size=1\", // minimizes batch size\n \"--micro_batch_size=1\", // minimizes batch size\n \"--val_set_size=0\", // disables validation\n \"--sample_packing=False\", // disables sample packing which is necessary for small datasets\n \"--eval_sample_packing=False\",// disables sample packing on eval set\n \"--dataset_prepared_path=temp_debug/axolotl_outputs/data\", // send data outputs to a temp folder\n \"--output_dir=temp_debug/axolotl_outputs/model\" // send model outputs to a temp folder\n ],\n \"console\": \"integratedTerminal\", // show output in the integrated terminal\n \"cwd\": \"${workspaceFolder}/devtools\", // set working directory to devtools from the root of the project\n \"justMyCode\": true, // step through only axolotl code\n \"env\": {\"CUDA_VISIBLE_DEVICES\": \"0\", // Since we aren't doing distributed training, we need to limit to one GPU\n \"HF_HOME\": \"${workspaceFolder}/devtools/temp_debug/.hf-cache\"}, // send HF cache to a temp folder\n \"preLaunchTask\": \"cleanup-for-dataprep\", // delete 
temp folders (see below)\n }\n ]\n}\nAdditional notes about this configuration:\n\nThe argument justMyCode is set to true such that you step through only the axolotl code. If you want to step into dependencies, set this to false.\nThe preLaunchTask: cleanup-for-dataprep is defined in .vscode/tasks.json and is used to delete the following folders before debugging, which is essential to ensure that the data pre-processing code is run from scratch:\n\n./devtools/temp_debug/axolotl_outputs\n./devtools/temp_debug/.hf-cache/datasets\n\n\n\n[!Tip]\nYou may not want to delete these folders. For example, if you are debugging model training instead of data pre-processing, you may NOT want to delete the cache or output folders. You may also need to add additional tasks to the tasks.json file depending on your use case.\n\nBelow is the ./vscode/tasks.json file that defines the cleanup-for-dataprep task. This task is run before each debugging session when you use the above configuration. Note how there are two tasks that delete the two folders mentioned above. The third task cleanup-for-dataprep is a composite task that combines the two tasks. 
A composite task is necessary because VSCode does not allow you to specify multiple tasks in the preLaunchTask argument of the launch.json file.\n// .vscode/tasks.json\n// this file is used by launch.json\n{\n \"version\": \"2.0.0\",\n \"tasks\": [\n // this task changes into the devtools directory and deletes the temp_debug/axolotl_outputs folder\n {\n \"label\": \"delete-outputs\",\n \"type\": \"shell\",\n \"command\": \"rm -rf temp_debug/axolotl_outputs\",\n \"options\":{ \"cwd\": \"${workspaceFolder}/devtools\"},\n \"problemMatcher\": []\n },\n // this task changes into the devtools directory and deletes the `temp_debug/.hf-cache/datasets` folder\n {\n \"label\": \"delete-temp-hf-dataset-cache\",\n \"type\": \"shell\",\n \"command\": \"rm -rf temp_debug/.hf-cache/datasets\",\n \"options\":{ \"cwd\": \"${workspaceFolder}/devtools\"},\n \"problemMatcher\": []\n },\n // this task combines the two tasks above\n {\n \"label\": \"cleanup-for-dataprep\",\n \"dependsOn\": [\"delete-outputs\", \"delete-temp-hf-dataset-cache\"],\n }\n ]\n}\n\n\nCustomizing your debugger\nYour debugging use case may differ from the example above. The easiest thing to do is to put your own axolotl config in the devtools folder and modify the launch.json file to use your config. You may also want to modify the preLaunchTask to delete different folders or not delete anything at all.\n\n\nVideo Tutorial\nThe following video tutorial walks through the above configuration and demonstrates how to debug with VSCode, (click the image below to watch):\n\n\n\nHamel Husain’s tutorial: Debugging Axolotl w/VSCode", + "crumbs": [ + "Troubleshooting", + "Debugging" + ] + }, + { + "objectID": "docs/debugging.html#debugging-with-docker", + "href": "docs/debugging.html#debugging-with-docker", + "title": "Debugging", + "section": "Debugging With Docker", + "text": "Debugging With Docker\nUsing official Axolotl Docker images is a great way to debug your code, and is a very popular way to use Axolotl. 
Attaching VSCode to Docker takes a few more steps.\n\nSetup\nOn the host that is running axolotl (ex: if you are using a remote host), clone the axolotl repo and change your current directory to the root:\ngit clone https://github.com/axolotl-ai-cloud/axolotl\ncd axolotl\n\n[!Tip]\nIf you already have axolotl cloned on your host, make sure you have the latest changes and change into the root of the project.\n\nNext, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:2\ndocker run --privileged --gpus '\"all\"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src=\"${PWD}\",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1\n\n[!Tip]\nTo understand which containers are available, see the Docker section of the README and the DockerHub repo. For details of how the Docker containers are built, see axolotl’s Docker CI builds.\n\nYou will now be in the container. Next, perform an editable install of Axolotl:\npip3 install packaging\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\n\nAttach To Container\nNext, if you are using a remote host, Remote into this host with VSCode. If you are using a local host, you can skip this step.\nNext, select Dev Containers: Attach to Running Container... using the command palette (CMD + SHIFT + P) in VSCode. You will be prompted to select a container to attach to. Select the container you just created. You will now be in the container with a working directory that is at the root of the project. 
Any changes you make to the code will be reflected both in the container and on the host.\nNow you are ready to debug as described above (see Debugging with VSCode).\n\n\nVideo - Attaching To Docker On Remote Host\nHere is a short video that demonstrates how to attach to a Docker container on a remote host:\n\n\n\nHamel Husain’s tutorial: Debugging Axolotl Part 2: Attaching to Docker on a Remote Host", + "crumbs": [ + "Troubleshooting", + "Debugging" + ] + }, + { + "objectID": "docs/debugging.html#footnotes", + "href": "docs/debugging.html#footnotes", + "title": "Debugging", + "section": "Footnotes", + "text": "Footnotes\n\n\nThe config actually mimics the command CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/chat_template.yml, but this is the same thing.↩︎\nMany of the below flags are recommended best practices by Nvidia when using nvidia-container-toolkit. You can read more about these flags here.↩︎", + "crumbs": [ + "Troubleshooting", + "Debugging" + ] + }, { "objectID": "docs/faq.html", "href": "docs/faq.html", "title": "FAQ", "section": "", - "text": "General\nQ: The trainer stopped and hasn’t progressed in several minutes.\n\nA: Usually an issue with the GPUs communicating with each other. See the NCCL doc\n\nQ: Exitcode -9\n\nA: This usually happens when you run out of system RAM.\n\nQ: Exitcode -7 while using deepspeed\n\nA: Try upgrading deepspeed w: pip install -U deepspeed\n\nQ: AttributeError: ‘DummyOptim’ object has no attribute ‘step’\nQ: ModuleNotFoundError: No module named ‘mpi4py’ using single GPU with deepspeed\n\nA: You may be using deepspeed with single gpu. Please remove the deepspeed: section in the yaml file or --deepspeed CLI flag.\n\nQ: The codes is stuck on saving preprocessed datasets.\n\nA: This is usually an issue with the GPU. This can be resolved through setting the os environment variable CUDA_VISIBLE_DEVICES=0. If you are on runpod, this is usually a pod issue. 
Starting a new pod should take care of it.\n\nQ: Received mismatch error on merge adapters / loading adapters between torch.Size of checkpoint and model.\n\nA: This is likely due to vocab size mismatch. By default, Axolotl expands the model’s embeddings if the tokenizer has more tokens than the model. Please use the axolotl merge-lora command to merge the adapters instead of using your own scripts.\n\n\nOn the other hand, if the model has more tokens than the tokenizer, Axolotl does not shrink the model’s embeddings unless shrink_embeddings: true is set in the config.\n\nQ: How to call Axolotl via custom python scripts?\n\nA: Yes, since Axolotl is just Python, please see src/axolotl/cli/main.py on how each command is called.\n\n\n\nChat templates\nQ: jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____\n\nA: This means that the property mapping for the stated attribute does not exist when building chat_template prompt. For example, if no attribute 'content', please check you have added the correct mapping for content under message_property_mappings.\n\nQ: Empty template generated for turn ___\n\nA: The content is empty for that turn.\n\nQ: Could not find content start/end boundary for turn __\n\nA: The specific turn’s start/end could not be detected. Please ensure you have set the eos_token following your chat_template. Otherwise, this could be a chat_template which doesn’t use proper boundaries for each turn (like system). On the rare occurrence, make sure your content is not [[dummy_message]]. Please let us know about this.\n\nQ: Content end boundary is before start boundary for turn ___\n\nA: This is an edge case which should not occur. Please create an Issue if this happens.\n\nQ: Content end boundary is the same as start boundary for turn ___. 
This is likely an empty turn.\n\nA: This is likely an empty turn.\n\nQ: The EOS/EOT token is incorrectly being masked or not being masked.\n\nA: This is because of the mismatch between tokenizer.eos_token and EOS/EOT token in template. Please make sure to set eos_token under special_tokens to the same EOS/EOT token as in template.\n\nQ: “chat_template choice is tokenizer_default but tokenizer’s chat_template is null. Please add a chat_template in tokenizer config”\n\nA: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See chat_template for more details.", + "text": "General\nQ: The trainer stopped and hasn’t progressed in several minutes.\n\nA: Usually an issue with the GPUs communicating with each other. See the NCCL doc\n\nQ: Exitcode -9\n\nA: This usually happens when you run out of system RAM.\n\nQ: Exitcode -7 while using deepspeed\n\nA: Try upgrading deepspeed w: pip install -U deepspeed\n\nQ: AttributeError: ‘DummyOptim’ object has no attribute ‘step’\nQ: ModuleNotFoundError: No module named ‘mpi4py’ using single GPU with deepspeed\n\nA: You may be using deepspeed with single gpu. Please remove the deepspeed: section in the yaml file or --deepspeed CLI flag.\n\nQ: The codes is stuck on saving preprocessed datasets.\n\nA: This is usually an issue with the GPU. This can be resolved through setting the os environment variable CUDA_VISIBLE_DEVICES=0. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it.\n\nQ: Received mismatch error on merge adapters / loading adapters between torch.Size of checkpoint and model.\n\nA: This is likely due to vocab size mismatch. By default, Axolotl expands the model’s embeddings if the tokenizer has more tokens than the model. 
Please use the axolotl merge-lora command to merge the adapters instead of using your own scripts.\n\n\nOn the other hand, if the model has more tokens than the tokenizer, Axolotl does not shrink the model’s embeddings unless shrink_embeddings: true is set in the config.\n\nQ: How to call Axolotl via custom python scripts?\n\nA: Yes, since Axolotl is just Python, please see src/axolotl/cli/main.py on how each command is called.\n\nQ: How to know the value to use for fsdp_transformer_layer_cls_to_wrap?\n\nA: This is the class name of the transformer layer to wrap with FSDP. For example, for LlamaForCausalLM, the value is LlamaDecoderLayer. To find this for a specific model, check the model’s PreTrainedModel definition and look for _no_split_modules variable in the modeling_<model_name>.py file within transformers library.\n\n\n\nChat templates\nQ: jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____\n\nA: This means that the property mapping for the stated attribute does not exist when building chat_template prompt. For example, if no attribute 'content', please check you have added the correct mapping for content under message_property_mappings.\n\nQ: Empty template generated for turn ___\n\nA: The content is empty for that turn.\n\nQ: Could not find content start/end boundary for turn __\n\nA: The specific turn’s start/end could not be detected. Please ensure you have set the eos_token following your chat_template. Otherwise, this could be a chat_template which doesn’t use proper boundaries for each turn (like system). On the rare occurrence, make sure your content is not [[dummy_message]]. Please let us know about this.\n\nQ: Content end boundary is before start boundary for turn ___\n\nA: This is an edge case which should not occur. Please create an Issue if this happens.\n\nQ: Content end boundary is the same as start boundary for turn ___. 
This is likely an empty turn.\n\nA: This is likely an empty turn.\n\nQ: The EOS/EOT token is incorrectly being masked or not being masked.\n\nA: This is because of the mismatch between tokenizer.eos_token and EOS/EOT token in template. Please make sure to set eos_token under special_tokens to the same EOS/EOT token as in template.\n\nQ: “chat_template choice is tokenizer_default but tokenizer’s chat_template is null. Please add a chat_template in tokenizer config”\n\nA: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See chat_template for more details.", "crumbs": [ "Troubleshooting", "FAQ" @@ -790,7 +1784,7 @@ "href": "docs/batch_vs_grad.html", "title": "Batch size vs Gradient accumulation", "section": "", - "text": "Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn’t significantly impact learning.\nThis method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here’s why:\n\nMemory Consumption with Batch Size: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.\nGradient Accumulation: With gradient accumulation, you’re effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you’re only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. 
As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.\n\nExample 1: Micro batch size: 3 Gradient accumulation steps: 2 Number of GPUs: 3 Total batch size = 3 * 2 * 3 = 18\n| GPU 1 | GPU 2 | GPU 3 |\n|----------------|----------------|----------------|\n| S1, S2, S3 | S4, S5, S6 | S7, S8, S9 |\n| e1, e2, e3 | e4, e5, e6 | e7, e8, e9 |\n|----------------|----------------|----------------|\n| → (accumulate) | → (accumulate) | → (accumulate) |\n|----------------|----------------|----------------|\n| S10, S11, S12 | S13, S14, S15 | S16, S17, S18 |\n| e10, e11, e12 | e13, e14, e15 | e16, e17, e18 |\n|----------------|----------------|----------------|\n| → (apply) | → (apply) | → (apply) |\n\nAccumulated gradient for the weight w1 after the second iteration (considering all GPUs):\nTotal gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18\n\nWeight update for w1:\nw1_new = w1_old - learning rate x (Total gradient for w1 / 18)\nExample 2: Micro batch size: 2 Gradient accumulation steps: 1 Number of GPUs: 3 Total batch size = 2 * 1 * 3 = 6\n| GPU 1 | GPU 2 | GPU 3 |\n|-----------|-----------|-----------|\n| S1, S2 | S3, S4 | S5, S6 |\n| e1, e2 | e3, e4 | e5, e6 |\n|-----------|-----------|-----------|\n| → (apply) | → (apply) | → (apply) |\n\nAccumulated gradient for the weight w1 (considering all GPUs):\nTotal gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6\n\nWeight update for w1:\nw1_new = w1_old - learning rate × (Total gradient for w1 / 6)", + "text": "Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn’t significantly impact learning.\nThis method allows for effective training with larger effective batch sizes without needing proportionally larger memory. 
Here’s why:\n\nMemory Consumption with Batch Size: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.\nGradient Accumulation: With gradient accumulation, you’re effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you’re only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.\n\nExample 1:\nMicro batch size: 3\nGradient accumulation steps: 2\nNumber of GPUs: 3\nTotal batch size = 3 * 2 * 3 = 18\n| GPU 1 | GPU 2 | GPU 3 |\n|----------------|----------------|----------------|\n| S1, S2, S3 | S4, S5, S6 | S7, S8, S9 |\n| e1, e2, e3 | e4, e5, e6 | e7, e8, e9 |\n|----------------|----------------|----------------|\n| → (accumulate) | → (accumulate) | → (accumulate) |\n|----------------|----------------|----------------|\n| S10, S11, S12 | S13, S14, S15 | S16, S17, S18 |\n| e10, e11, e12 | e13, e14, e15 | e16, e17, e18 |\n|----------------|----------------|----------------|\n| → (apply) | → (apply) | → (apply) |\n\nAccumulated gradient for the weight w1 after the second iteration (considering all GPUs):\nTotal gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18\n\nWeight update for w1:\nw1_new = w1_old - learning rate x (Total gradient for w1 / 18)\nExample 2:\nMicro batch size: 2\nGradient accumulation steps: 1\nNumber of GPUs: 
3\nTotal batch size = 2 * 1 * 3 = 6\n| GPU 1 | GPU 2 | GPU 3 |\n|-----------|-----------|-----------|\n| S1, S2 | S3, S4 | S5, S6 |\n| e1, e2 | e3, e4 | e5, e6 |\n|-----------|-----------|-----------|\n| → (apply) | → (apply) | → (apply) |\n\nAccumulated gradient for the weight w1 (considering all GPUs):\nTotal gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6\n\nWeight update for w1:\nw1_new = w1_old - learning rate × (Total gradient for w1 / 6)", "crumbs": [ "Core Concepts", "Batch size vs Gradient accumulation" @@ -801,7 +1795,7 @@ "href": "docs/lora_optims.html", "title": "LoRA Optimizations", "section": "", - "text": "Inspired by Unsloth, we’ve implemented two optimizations for LoRA and QLoRA fine-tuning, supporting both single GPU and multi-GPU (in the DDP and DeepSpeed settings) training. These include (1) SwiGLU and GEGLU activation function Triton kernels, and (2) LoRA MLP and attention custom autograd functions. Our goal was to leverage operator fusion and tensor re-use in order to improve speed and reduce memory usage during the forward and backward passes of these calculations.\nWe currently support several common model architectures, including (but not limited to):", + "text": "Inspired by Unsloth, we’ve implemented two\noptimizations for LoRA and QLoRA fine-tuning, supporting both single GPU and multi-GPU\n(in the DDP and DeepSpeed settings) training. These include (1) SwiGLU and GEGLU activation function\nTriton kernels, and (2) LoRA MLP and attention custom autograd functions. 
Our goal was\nto leverage operator fusion and tensor re-use in order to improve speed and reduce\nmemory usage during the forward and backward passes of these calculations.\nWe currently support several common model architectures, including (but not limited to):", "crumbs": [ "How To Guides", "LoRA Optimizations" @@ -812,7 +1806,7 @@ "href": "docs/lora_optims.html#usage", "title": "LoRA Optimizations", "section": "Usage", - "text": "Usage\nThese optimizations can be enabled in your Axolotl config YAML file. The lora_mlp_kernel option enables the optimized MLP path, while lora_qkv_kernel and lora_o_kernel enable the fused query-key-value projection and optimized output projection, respectively.\nlora_mlp_kernel: true\nlora_qkv_kernel: true\nlora_o_kernel: true", + "text": "Usage\nThese optimizations can be enabled in your Axolotl config YAML file. The\nlora_mlp_kernel option enables the optimized MLP path, while lora_qkv_kernel and\nlora_o_kernel enable the fused query-key-value projection and optimized output\nprojection, respectively.\nlora_mlp_kernel: true\nlora_qkv_kernel: true\nlora_o_kernel: true", "crumbs": [ "How To Guides", "LoRA Optimizations" @@ -823,7 +1817,7 @@ "href": "docs/lora_optims.html#requirements", "title": "LoRA Optimizations", "section": "Requirements", - "text": "Requirements\n\nOne or more NVIDIA or AMD GPUs (in order to use the Triton kernels)\n\nNote: Set TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 to enable memory-efficient attention on AMD GPUs\n\nTargeted LoRA adapters cannot use Dropout\n\nThis may limit model expressivity / cause overfitting\n\nTargeted LoRA adapters cannot have bias terms\n\nThis may limit model expressivity\n\n\nModels with pre-existing LoRA adapters that use Dropout or have bias terms may need to be re-finetuned without these features in order to be useful.", + "text": "Requirements\n\nOne or more NVIDIA or AMD GPUs (in order to use the Triton kernels)\n\nNote: Set TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 to enable 
memory-efficient attention on AMD GPUs\n\nTargeted LoRA adapters cannot use Dropout\n\nThis may limit model expressivity / cause overfitting\n\nTargeted LoRA adapters cannot have bias terms\n\nThis may limit model expressivity\n\n\nModels with pre-existing LoRA adapters that use Dropout or have bias terms may need to\nbe re-finetuned without these features in order to be useful.", "crumbs": [ "How To Guides", "LoRA Optimizations" @@ -834,7 +1828,7 @@ "href": "docs/lora_optims.html#implementation-details", "title": "LoRA Optimizations", "section": "Implementation details", - "text": "Implementation details\n\nCustom autograd functions\nThe LoRA MLP autograd function optimizes the entire MLP computation path. It fuses the LoRA and base weight computations together and provides a single, efficient backward pass for the entire MLP block.\nFor attention components, similar optimizations are provided through a function that handles the query, key, and value projections, and a function that handles the output projection. They are designed to work with the existing transformers attention implementation via some monkey-patching logic.\n\n\nTriton kernels\nTwo activation functions (SwiGLU and GeGLU) are implemented with Triton kernels for improved speed and memory performance. These kernels handle both the forward and backward passes.\n\n\nIntegration\nThe custom autograd functions and Triton kernels are designed to work together. The autograd function manages the high-level computation flow and gradient tracking, while calling the Triton kernels for the activation function computation. During the backward pass, the kernel computes both the activation output and the required gradients, which the autograd function then uses to compute the final gradients for the entire computation path.", + "text": "Implementation details\n\nCustom autograd functions\nThe LoRA MLP autograd function optimizes the entire MLP computation path. 
It fuses the\nLoRA and base weight computations together and provides a single, efficient backward\npass for the entire MLP block.\nFor attention components, similar optimizations are provided through a function that\nhandles the query, key, and value projections, and a function that handles the output\nprojection. They are designed to work with the existing transformers attention\nimplementation via some monkey-patching logic.\n\n\nTriton kernels\nTwo activation functions (SwiGLU and GeGLU) are implemented with Triton kernels for\nimproved speed and memory performance. These kernels handle both the forward and\nbackward passes.\n\n\nIntegration\nThe custom autograd functions and Triton kernels are designed to work together. The\nautograd function manages the high-level computation flow and gradient tracking, while\ncalling the Triton kernels for the activation function computation. During the backward\npass, the kernel computes both the activation output and the required gradients, which\nthe autograd function then uses to compute the final gradients for the entire\ncomputation path.", "crumbs": [ "How To Guides", "LoRA Optimizations" @@ -851,12 +1845,838 @@ "LoRA Optimizations" ] }, + { + "objectID": "docs/api/utils.lora_embeddings.html", + "href": "docs/api/utils.lora_embeddings.html", + "title": "utils.lora_embeddings", + "section": "", + "text": "utils.lora_embeddings\nhelpers for lora embeddings\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_linear_embedding_layers\nreturns the linear embedding layers needed for loras, dependent on the model arch\n\n\n\n\n\nutils.lora_embeddings.get_linear_embedding_layers(model_type)\nreturns the linear embedding layers needed for loras, dependent on the model arch" + }, + { + "objectID": "docs/api/utils.lora_embeddings.html#functions", + "href": "docs/api/utils.lora_embeddings.html#functions", + "title": "utils.lora_embeddings", + "section": "", + "text": "Name\nDescription\n\n\n\n\nget_linear_embedding_layers\nreturns the 
linear embedding layers needed for loras, dependent on the model arch\n\n\n\n\n\nutils.lora_embeddings.get_linear_embedding_layers(model_type)\nreturns the linear embedding layers needed for loras, dependent on the model arch" + }, + { + "objectID": "docs/api/kernels.utils.html", + "href": "docs/api/kernels.utils.html", + "title": "kernels.utils", + "section": "", + "text": "kernels.utils\nkernels.utils\nUtilities for axolotl.kernels submodules." + }, + { + "objectID": "docs/api/prompt_strategies.chat_template.html", + "href": "docs/api/prompt_strategies.chat_template.html", + "title": "prompt_strategies.chat_template", + "section": "", + "text": "prompt_strategies.chat_template\nHF Chat Templates prompt strategy\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatTemplatePrompter\nPrompter for HF chat templates\n\n\nChatTemplateStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nStrategyLoader\nLoad chat template strategy based on configuration.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter(\n self,\n tokenizer,\n chat_template,\n processor=None,\n max_length=2048,\n message_property_mappings=None,\n message_field_training=None,\n message_field_training_detail=None,\n field_messages='messages',\n roles=None,\n drop_system_message=False,\n)\nPrompter for HF chat templates\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs,\n sequence_len,\n roles_to_train=None,\n train_on_eos=None,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_turn\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\ntokenize_prompt\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_turn(turns, turn_idx)\nLocate the starting and ending indices of the specified turn in a 
conversation.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.tokenize_prompt(prompt)\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.StrategyLoader()\nLoad chat template strategy based on configuration." + }, + { + "objectID": "docs/api/prompt_strategies.chat_template.html#classes", + "href": "docs/api/prompt_strategies.chat_template.html#classes", + "title": "prompt_strategies.chat_template", + "section": "", + "text": "Name\nDescription\n\n\n\n\nChatTemplatePrompter\nPrompter for HF chat templates\n\n\nChatTemplateStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nStrategyLoader\nLoad chat template strategy based on configuration.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter(\n self,\n tokenizer,\n chat_template,\n processor=None,\n max_length=2048,\n message_property_mappings=None,\n message_field_training=None,\n message_field_training_detail=None,\n field_messages='messages',\n roles=None,\n drop_system_message=False,\n)\nPrompter for HF chat templates\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs,\n sequence_len,\n roles_to_train=None,\n train_on_eos=None,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_turn\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\ntokenize_prompt\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_turn(turns, turn_idx)\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.tokenize_prompt(prompt)\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.StrategyLoader()\nLoad chat template strategy based on configuration." 
+ }, + { + "objectID": "docs/api/convert.html", + "href": "docs/api/convert.html", + "title": "convert", + "section": "", + "text": "convert\nModule containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes\n\n\n\n\n\nName\nDescription\n\n\n\n\nFileReader\nReads a file and returns its contents as a string\n\n\nFileWriter\nWrites a string to a file\n\n\nJsonParser\nParses a string as JSON and returns the result\n\n\nJsonToJsonlConverter\nConverts a JSON file to JSONL\n\n\nJsonlSerializer\nSerializes a list of JSON objects into a JSONL string\n\n\nStdoutWriter\nWrites a string to stdout\n\n\n\n\n\nconvert.FileReader()\nReads a file and returns its contents as a string\n\n\n\nconvert.FileWriter(self, file_path)\nWrites a string to a file\n\n\n\nconvert.JsonParser()\nParses a string as JSON and returns the result\n\n\n\nconvert.JsonToJsonlConverter(\n self,\n file_reader,\n file_writer,\n json_parser,\n jsonl_serializer,\n)\nConverts a JSON file to JSONL\n\n\n\nconvert.JsonlSerializer()\nSerializes a list of JSON objects into a JSONL string\n\n\n\nconvert.StdoutWriter()\nWrites a string to stdout" + }, + { + "objectID": "docs/api/convert.html#classes", + "href": "docs/api/convert.html#classes", + "title": "convert", + "section": "", + "text": "Name\nDescription\n\n\n\n\nFileReader\nReads a file and returns its contents as a string\n\n\nFileWriter\nWrites a string to a file\n\n\nJsonParser\nParses a string as JSON and returns the result\n\n\nJsonToJsonlConverter\nConverts a JSON file to JSONL\n\n\nJsonlSerializer\nSerializes a list of JSON objects into a JSONL string\n\n\nStdoutWriter\nWrites a string to stdout\n\n\n\n\n\nconvert.FileReader()\nReads a file and returns its contents as a string\n\n\n\nconvert.FileWriter(self, file_path)\nWrites a string to a file\n\n\n\nconvert.JsonParser()\nParses a string as JSON and returns the result\n\n\n\nconvert.JsonToJsonlConverter(\n self,\n file_reader,\n file_writer,\n json_parser,\n 
jsonl_serializer,\n)\nConverts a JSON file to JSONL\n\n\n\nconvert.JsonlSerializer()\nSerializes a list of JSON objects into a JSONL string\n\n\n\nconvert.StdoutWriter()\nWrites a string to stdout" + }, + { + "objectID": "docs/api/common.const.html", + "href": "docs/api/common.const.html", + "title": "common.const", + "section": "", + "text": "common.const\ncommon.const\nVarious shared constants" + }, + { + "objectID": "docs/api/cli.cloud.base.html", + "href": "docs/api/cli.cloud.base.html", + "title": "cli.cloud.base", + "section": "", + "text": "cli.cloud.base\nbase class for cloud platforms from cli\n\n\n\n\n\nName\nDescription\n\n\n\n\nCloud\nAbstract base class for cloud platforms.\n\n\n\n\n\ncli.cloud.base.Cloud()\nAbstract base class for cloud platforms." + }, + { + "objectID": "docs/api/cli.cloud.base.html#classes", + "href": "docs/api/cli.cloud.base.html#classes", + "title": "cli.cloud.base", + "section": "", + "text": "Name\nDescription\n\n\n\n\nCloud\nAbstract base class for cloud platforms.\n\n\n\n\n\ncli.cloud.base.Cloud()\nAbstract base class for cloud platforms." 
+ }, + { + "objectID": "docs/api/monkeypatch.relora.html", + "href": "docs/api/monkeypatch.relora.html", + "title": "monkeypatch.relora", + "section": "", + "text": "monkeypatch.relora\nImplements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune.\n\n\n\n\n\nName\nDescription\n\n\n\n\nReLoRACallback\nCallback to merge LoRA weights into the base model and save full-weight checkpoints\n\n\nReLoRAScheduler\nWraps another scheduler to apply per-lora-restart learning rate warmups.\n\n\n\n\n\nmonkeypatch.relora.ReLoRACallback(self, cfg)\nCallback to merge LoRA weights into the base model and save full-weight checkpoints\n\n\n\nmonkeypatch.relora.ReLoRAScheduler(\n self,\n optimizer,\n inner_schedule,\n relora_steps,\n warmup_steps,\n anneal_steps=1,\n min_lr_scale=0.001,\n)\nWraps another scheduler to apply per-lora-restart learning rate warmups." + }, + { + "objectID": "docs/api/monkeypatch.relora.html#classes", + "href": "docs/api/monkeypatch.relora.html#classes", + "title": "monkeypatch.relora", + "section": "", + "text": "Name\nDescription\n\n\n\n\nReLoRACallback\nCallback to merge LoRA weights into the base model and save full-weight checkpoints\n\n\nReLoRAScheduler\nWraps another scheduler to apply per-lora-restart learning rate warmups.\n\n\n\n\n\nmonkeypatch.relora.ReLoRACallback(self, cfg)\nCallback to merge LoRA weights into the base model and save full-weight checkpoints\n\n\n\nmonkeypatch.relora.ReLoRAScheduler(\n self,\n optimizer,\n inner_schedule,\n relora_steps,\n warmup_steps,\n anneal_steps=1,\n min_lr_scale=0.001,\n)\nWraps another scheduler to apply per-lora-restart learning rate warmups." 
+ }, + { + "objectID": "docs/api/utils.lora.html", + "href": "docs/api/utils.lora.html", + "title": "utils.lora", + "section": "", + "text": "utils.lora\nmodule to get the state dict of a merged lora model\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_lora_merged_state_dict\nCreate and return a state_dict that has the LoRA deltas\n\n\n\n\n\nutils.lora.get_lora_merged_state_dict(model)\nCreate and return a state_dict that has the LoRA deltas\nmerged into the base model’s weights, without modifying model in place.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\ntorch.nn.Module\nA model that has LoRA/PEFT adapters attached.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndict\ndict\nA state_dict of the merged parameters." + }, + { + "objectID": "docs/api/utils.lora.html#functions", + "href": "docs/api/utils.lora.html#functions", + "title": "utils.lora", + "section": "", + "text": "Name\nDescription\n\n\n\n\nget_lora_merged_state_dict\nCreate and return a state_dict that has the LoRA deltas\n\n\n\n\n\nutils.lora.get_lora_merged_state_dict(model)\nCreate and return a state_dict that has the LoRA deltas\nmerged into the base model’s weights, without modifying model in place.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\ntorch.nn.Module\nA model that has LoRA/PEFT adapters attached.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndict\ndict\nA state_dict of the merged parameters." + }, + { + "objectID": "docs/api/cli.merge_lora.html", + "href": "docs/api/cli.merge_lora.html", + "title": "cli.merge_lora", + "section": "", + "text": "cli.merge_lora\nCLI to merge a trained LoRA into a base model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_merge_lora. 
Note that various\n\n\ndo_merge_lora\nCalls transformers’ merge_and_unload on the model given in the axolotl config\n\n\n\n\n\ncli.merge_lora.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_merge_lora. Note that various\nconfig values will be overwritten to allow the LoRA merge logic to work as expected\n(load_in_8bit=False, load_in4bit=False, flash_attention=False, etc.).\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf target directory for LoRA merged model does not exist.\n\n\n\n\n\n\n\ncli.merge_lora.do_merge_lora(cfg)\nCalls transformers’ merge_and_unload on the model given in the axolotl config\nalong with the LoRA adapters to combine them into a single base model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired" + }, + { + "objectID": "docs/api/cli.merge_lora.html#functions", + "href": "docs/api/cli.merge_lora.html#functions", + "title": "cli.merge_lora", + "section": "", + "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_merge_lora. Note that various\n\n\ndo_merge_lora\nCalls transformers’ merge_and_unload on the model given in the axolotl config\n\n\n\n\n\ncli.merge_lora.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_merge_lora. 
Note that various\nconfig values will be overwritten to allow the LoRA merge logic to work as expected\n(load_in_8bit=False, load_in4bit=False, flash_attention=False, etc.).\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf target directory for LoRA merged model does not exist.\n\n\n\n\n\n\n\ncli.merge_lora.do_merge_lora(cfg)\nCalls transformers’ merge_and_unload on the model given in the axolotl config\nalong with the LoRA adapters to combine them into a single base model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired" + }, + { + "objectID": "docs/api/prompt_strategies.bradley_terry.llama3.html", + "href": "docs/api/prompt_strategies.bradley_terry.llama3.html", + "title": "prompt_strategies.bradley_terry.llama3", + "section": "", + "text": "prompt_strategies.bradley_terry.llama3\nchatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template\n\n\n\n\n\nName\nDescription\n\n\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\n\n\n\nprompt_strategies.bradley_terry.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. 
https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs" + }, + { + "objectID": "docs/api/prompt_strategies.bradley_terry.llama3.html#functions", + "href": "docs/api/prompt_strategies.bradley_terry.llama3.html#functions", + "title": "prompt_strategies.bradley_terry.llama3", + "section": "", + "text": "Name\nDescription\n\n\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\n\n\n\nprompt_strategies.bradley_terry.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs" + }, + { + "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html", + "href": "docs/api/cli.merge_sharded_fsdp_weights.html", + "title": "cli.merge_sharded_fsdp_weights", + "section": "", + "text": "cli.merge_sharded_fsdp_weights\nCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.\n\n\n\n\n\nName\nDescription\n\n\n\n\nBFloat16CastPlanner\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.BFloat16CastPlanner()\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\nmerge_fsdp_weights\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. 
Should be used if\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.merge_fsdp_weights(\n checkpoint_dir,\n output_path,\n safe_serialization=False,\n remove_checkpoint_dir=False,\n)\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\nSHARDED_STATE_DICT was used for the model. Weights will be saved to {output_path}/model.safetensors if\nsafe_serialization else pytorch_model.bin.\nNote: this is a CPU-bound process.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncheckpoint_dir\nstr\nThe directory containing the FSDP checkpoints (can be either the model or optimizer).\nrequired\n\n\noutput_path\nstr\nThe path to save the merged checkpoint.\nrequired\n\n\nsafe_serialization\nbool, optional, defaults to True\nWhether to save the merged weights with safetensors (recommended).\nFalse\n\n\nremove_checkpoint_dir\nbool, optional, defaults to False\nWhether to remove the checkpoint directory after merging.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf torch version < 2.3.0, or if checkpoint_dir does not exist." + }, + { + "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html#classes", + "href": "docs/api/cli.merge_sharded_fsdp_weights.html#classes", + "title": "cli.merge_sharded_fsdp_weights", + "section": "", + "text": "Name\nDescription\n\n\n\n\nBFloat16CastPlanner\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.BFloat16CastPlanner()\nA custom planner to cast tensors to bfloat16 on the fly during loading." 
+ }, + { + "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html#functions", + "href": "docs/api/cli.merge_sharded_fsdp_weights.html#functions", + "title": "cli.merge_sharded_fsdp_weights", + "section": "", + "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\nmerge_fsdp_weights\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.merge_fsdp_weights(\n checkpoint_dir,\n output_path,\n safe_serialization=False,\n remove_checkpoint_dir=False,\n)\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\nSHARDED_STATE_DICT was used for the model. Weights will be saved to {output_path}/model.safetensors if\nsafe_serialization else pytorch_model.bin.\nNote: this is a CPU-bound process.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncheckpoint_dir\nstr\nThe directory containing the FSDP checkpoints (can be either the model or optimizer).\nrequired\n\n\noutput_path\nstr\nThe path to save the merged checkpoint.\nrequired\n\n\nsafe_serialization\nbool, optional, defaults to True\nWhether to save the merged weights with safetensors (recommended).\nFalse\n\n\nremove_checkpoint_dir\nbool, optional, defaults to False\nWhether to remove the checkpoint directory after merging.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf torch version < 2.3.0, or if checkpoint_dir does not exist." 
+ }, + { + "objectID": "docs/api/integrations.spectrum.args.html", + "href": "docs/api/integrations.spectrum.args.html", + "title": "integrations.spectrum.args", + "section": "", + "text": "integrations.spectrum.args\nModule for handling Spectrum input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nSpectrumArgs\nInput args for Spectrum.\n\n\n\n\n\nintegrations.spectrum.args.SpectrumArgs()\nInput args for Spectrum." + }, + { + "objectID": "docs/api/integrations.spectrum.args.html#classes", + "href": "docs/api/integrations.spectrum.args.html#classes", + "title": "integrations.spectrum.args", + "section": "", + "text": "Name\nDescription\n\n\n\n\nSpectrumArgs\nInput args for Spectrum.\n\n\n\n\n\nintegrations.spectrum.args.SpectrumArgs()\nInput args for Spectrum." + }, + { + "objectID": "docs/api/models.mamba.modeling_mamba.html", + "href": "docs/api/models.mamba.modeling_mamba.html", + "title": "models.mamba.modeling_mamba", + "section": "", + "text": "models.mamba.modeling_mamba\nmodels.mamba.modeling_mamba" + }, + { + "objectID": "docs/api/common.architectures.html", + "href": "docs/api/common.architectures.html", + "title": "common.architectures", + "section": "", + "text": "common.architectures\ncommon.architectures\nCommon architecture specific constants" + }, + { + "objectID": "docs/api/utils.trainer.html", + "href": "docs/api/utils.trainer.html", + "title": "utils.trainer", + "section": "", + "text": "utils.trainer\nModule containing the Trainer class and related functions\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_pose_position_ids\nuse the PoSE technique to extend the context length by randomly skipping\n\n\nadd_position_ids\nHandle both single-example and batched data.\n\n\ndrop_long_seq\nDrop samples whose sequence length is either too long (> sequence_len)\n\n\nsetup_trainer\nHelper method for instantiating and building a (causal or RLHF) trainer.\n\n\n\n\n\nutils.trainer.add_pose_position_ids(\n sample,\n max_context_len=32768,\n 
split_on_token_ids=None,\n chunks=2,\n)\nuse the PoSE technique to extend the context length by randomly skipping\npositions in the context. We only want to skip right before tokens in\nthe split_on_token_ids list. We should attempt to randomly distribute\nthe skips, but we don’t need the final position_ids to be the full\ncontext_len. There may be multiple turns in the context, so we want to\nmake sure we take into account the maximum possible number of skips\nremaining in each sample.\n\n\n\nutils.trainer.add_position_ids(sample)\nHandle both single-example and batched data.\n- single example: sample[‘input_ids’] is a list[int]\n- batched data: sample[‘input_ids’] is a list[list[int]]\n\n\n\nutils.trainer.drop_long_seq(sample, sequence_len=2048, min_sequence_len=2)\nDrop samples whose sequence length is either too long (> sequence_len)\nor too short (< min_sequence_len).\nWorks for both single-example (list[int]) or batched (list[list[int]]).\n\n\n\nutils.trainer.setup_trainer(\n cfg,\n train_dataset,\n eval_dataset,\n model,\n tokenizer,\n processor,\n total_num_steps,\n model_ref=None,\n peft_config=None,\n)\nHelper method for instantiating and building a (causal or RLHF) trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nAxolotl config object containing training parameters.\nrequired\n\n\ntrain_dataset\n\nDataset to use for training.\nrequired\n\n\neval_dataset\n\nDataset to use for evaluation.\nrequired\n\n\nmodel\n\nThe model to train.\nrequired\n\n\ntokenizer\n\nTokenizer for processing text input.\nrequired\n\n\nprocessor\n\nProcessor for data preparation.\nrequired\n\n\ntotal_num_steps\n\nThe total number of training steps.\nrequired\n\n\nmodel_ref\n\nOptional reference model for RLHF training. Default is None.\nNone\n\n\npeft_config\n\nOptional PEFT (Parameter-Efficient Fine-Tuning) configuration. 
Default is None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nA trainer instance (either HFRLTrainer or HFCausalTrainer) configured based on the provided parameters." + }, + { + "objectID": "docs/api/utils.trainer.html#functions", + "href": "docs/api/utils.trainer.html#functions", + "title": "utils.trainer", + "section": "", + "text": "Name\nDescription\n\n\n\n\nadd_pose_position_ids\nuse the PoSE technique to extend the context length by randomly skipping\n\n\nadd_position_ids\nHandle both single-example and batched data.\n\n\ndrop_long_seq\nDrop samples whose sequence length is either too long (> sequence_len)\n\n\nsetup_trainer\nHelper method for instantiating and building a (causal or RLHF) trainer.\n\n\n\n\n\nutils.trainer.add_pose_position_ids(\n sample,\n max_context_len=32768,\n split_on_token_ids=None,\n chunks=2,\n)\nuse the PoSE technique to extend the context length by randomly skipping\npositions in the context. We only want to skip right before tokens in\nthe split_on_token_ids list. We should attempt to randomly distribute\nthe skips, but we don’t need the final position_ids to be the full\ncontext_len. 
There may be multiple turns in the context, so we want to\nmake sure we take into account the maximum possible number of skips\nremaining in each sample.\n\n\n\nutils.trainer.add_position_ids(sample)\nHandle both single-example and batched data.\n- single example: sample[‘input_ids’] is a list[int]\n- batched data: sample[‘input_ids’] is a list[list[int]]\n\n\n\nutils.trainer.drop_long_seq(sample, sequence_len=2048, min_sequence_len=2)\nDrop samples whose sequence length is either too long (> sequence_len)\nor too short (< min_sequence_len).\nWorks for both single-example (list[int]) or batched (list[list[int]]).\n\n\n\nutils.trainer.setup_trainer(\n cfg,\n train_dataset,\n eval_dataset,\n model,\n tokenizer,\n processor,\n total_num_steps,\n model_ref=None,\n peft_config=None,\n)\nHelper method for instantiating and building a (causal or RLHF) trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nAxolotl config object containing training parameters.\nrequired\n\n\ntrain_dataset\n\nDataset to use for training.\nrequired\n\n\neval_dataset\n\nDataset to use for evaluation.\nrequired\n\n\nmodel\n\nThe model to train.\nrequired\n\n\ntokenizer\n\nTokenizer for processing text input.\nrequired\n\n\nprocessor\n\nProcessor for data preparation.\nrequired\n\n\ntotal_num_steps\n\nThe total number of training steps.\nrequired\n\n\nmodel_ref\n\nOptional reference model for RLHF training. Default is None.\nNone\n\n\npeft_config\n\nOptional PEFT (Parameter-Efficient Fine-Tuning) configuration. Default is None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nA trainer instance (either HFRLTrainer or HFCausalTrainer) configured based on the provided parameters." 
+ }, + { + "objectID": "docs/api/utils.callbacks.comet_.html", + "href": "docs/api/utils.callbacks.comet_.html", + "title": "utils.callbacks.comet_", + "section": "", + "text": "utils.callbacks.comet_\nComet module for trainer callbacks\n\n\n\n\n\nName\nDescription\n\n\n\n\nSaveAxolotlConfigtoCometCallback\nCallback to save axolotl config to comet\n\n\n\n\n\nutils.callbacks.comet_.SaveAxolotlConfigtoCometCallback(\n self,\n axolotl_config_path,\n)\nCallback to save axolotl config to comet" + }, + { + "objectID": "docs/api/utils.callbacks.comet_.html#classes", + "href": "docs/api/utils.callbacks.comet_.html#classes", + "title": "utils.callbacks.comet_", + "section": "", + "text": "Name\nDescription\n\n\n\n\nSaveAxolotlConfigtoCometCallback\nCallback to save axolotl config to comet\n\n\n\n\n\nutils.callbacks.comet_.SaveAxolotlConfigtoCometCallback(\n self,\n axolotl_config_path,\n)\nCallback to save axolotl config to comet" + }, + { + "objectID": "docs/api/monkeypatch.llama_patch_multipack.html", + "href": "docs/api/monkeypatch.llama_patch_multipack.html", + "title": "monkeypatch.llama_patch_multipack", + "section": "", + "text": "monkeypatch.llama_patch_multipack\nmonkeypatch.llama_patch_multipack\nPatched LlamaAttention to use torch.nn.functional.scaled_dot_product_attention" + }, + { + "objectID": "docs/api/utils.gradient_checkpointing.unsloth.html", + "href": "docs/api/utils.gradient_checkpointing.unsloth.html", + "title": "utils.gradient_checkpointing.unsloth", + "section": "", + "text": "utils.gradient_checkpointing.unsloth\nUnsloth checkpointing\n\n\n\n\n\nName\nDescription\n\n\n\n\nUnsloth_Offloaded_Gradient_Checkpointer\nSaves VRAM by smartly offloading to RAM.\n\n\n\n\n\nutils.gradient_checkpointing.unsloth.Unsloth_Offloaded_Gradient_Checkpointer()\nSaves VRAM by smartly offloading to RAM.\nTiny hit to performance, since we mask the movement via non blocking calls." 
+ }, + { + "objectID": "docs/api/utils.gradient_checkpointing.unsloth.html#classes", + "href": "docs/api/utils.gradient_checkpointing.unsloth.html#classes", + "title": "utils.gradient_checkpointing.unsloth", + "section": "", + "text": "Name\nDescription\n\n\n\n\nUnsloth_Offloaded_Gradient_Checkpointer\nSaves VRAM by smartly offloading to RAM.\n\n\n\n\n\nutils.gradient_checkpointing.unsloth.Unsloth_Offloaded_Gradient_Checkpointer()\nSaves VRAM by smartly offloading to RAM.\nTiny hit to performance, since we mask the movement via non blocking calls." + }, + { + "objectID": "docs/api/core.trainers.base.html", + "href": "docs/api/core.trainers.base.html", + "title": "core.trainers.base", + "section": "", + "text": "core.trainers.base\nModule for customized trainers\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlTrainer\nExtend the base Trainer for axolotl helpers\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer(\n self,\n *_args,\n bench_data_collator=None,\n eval_data_collator=None,\n dataset_tags=None,\n **kwargs,\n)\nExtend the base Trainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_eval_dataloader\nGet dataloader for evaluation\n\n\nget_train_dataloader\nGet dataloader for training\n\n\nlog\nLog logs on the various objects watching training, including stored metrics.\n\n\npush_to_hub\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\n\n\ntraining_step\nPerform a training step on a batch of inputs. 
Overrides the\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer.get_eval_dataloader(eval_dataset=None)\nGet dataloader for evaluation\n\n\n\ncore.trainers.base.AxolotlTrainer.get_train_dataloader()\nGet dataloader for training\n\n\n\ncore.trainers.base.AxolotlTrainer.log(logs, start_time=None)\nLog logs on the various objects watching training, including stored metrics.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nlogs\ndict[str, float]\nThe values to log.\nrequired\n\n\nstart_time\nfloat | None\nThe start of training.\nNone\n\n\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer.push_to_hub(*args, **kwargs)\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\nmodel on the Hub. Please refer to ~transformers.Trainer.push_to_hub for more details.\n\n\n\ncore.trainers.base.AxolotlTrainer.training_step(\n model,\n inputs,\n num_items_in_batch=None,\n)\nPerform a training step on a batch of inputs. Overrides the\ntransformers.trainer.Trainer method to handle sequence parallelism if\nenabled.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nnn.Module\nModel to perform training step for.\nrequired\n\n\ninputs\ndict[str, torch.Tensor | Any]\nDictionary mapping.\nrequired" + }, + { + "objectID": "docs/api/core.trainers.base.html#classes", + "href": "docs/api/core.trainers.base.html#classes", + "title": "core.trainers.base", + "section": "", + "text": "Name\nDescription\n\n\n\n\nAxolotlTrainer\nExtend the base Trainer for axolotl helpers\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer(\n self,\n *_args,\n bench_data_collator=None,\n eval_data_collator=None,\n dataset_tags=None,\n **kwargs,\n)\nExtend the base Trainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_eval_dataloader\nGet dataloader for evaluation\n\n\nget_train_dataloader\nGet dataloader for training\n\n\nlog\nLog logs on the various objects watching training, including stored metrics.\n\n\npush_to_hub\nOverwrite the push_to_hub method in 
order to force-add the tags when pushing the\n\n\ntraining_step\nPerform a training step on a batch of inputs. Overrides the\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer.get_eval_dataloader(eval_dataset=None)\nGet dataloader for evaluation\n\n\n\ncore.trainers.base.AxolotlTrainer.get_train_dataloader()\nGet dataloader for training\n\n\n\ncore.trainers.base.AxolotlTrainer.log(logs, start_time=None)\nLog logs on the various objects watching training, including stored metrics.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nlogs\ndict[str, float]\nThe values to log.\nrequired\n\n\nstart_time\nfloat | None\nThe start of training.\nNone\n\n\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer.push_to_hub(*args, **kwargs)\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\nmodel on the Hub. Please refer to ~transformers.Trainer.push_to_hub for more details.\n\n\n\ncore.trainers.base.AxolotlTrainer.training_step(\n model,\n inputs,\n num_items_in_batch=None,\n)\nPerform a training step on a batch of inputs. 
Overrides the\ntransformers.trainer.Trainer method to handle sequence parallelism if\nenabled.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nnn.Module\nModel to perform training step for.\nrequired\n\n\ninputs\ndict[str, torch.Tensor | Any]\nDictionary mapping.\nrequired" + }, + { + "objectID": "docs/api/monkeypatch.unsloth_.html", + "href": "docs/api/monkeypatch.unsloth_.html", + "title": "monkeypatch.unsloth_", + "section": "", + "text": "monkeypatch.unsloth_\nmonkeypatch.unsloth_\nmodule for patching with unsloth optimizations" + }, + { + "objectID": "docs/api/utils.samplers.multipack.html", + "href": "docs/api/utils.samplers.multipack.html", + "title": "utils.samplers.multipack", + "section": "", + "text": "utils.samplers.multipack\nMultipack Batch Sampler\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for multipack\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n self,\n sampler,\n batch_size,\n batch_max_len,\n lengths,\n packing_efficiency_estimate=1.0,\n drop_last=False,\n num_count_samples=16,\n **kwargs,\n)\nBatch sampler class for multipack" + }, + { + "objectID": "docs/api/utils.samplers.multipack.html#classes", + "href": "docs/api/utils.samplers.multipack.html#classes", + "title": "utils.samplers.multipack", + "section": "", + "text": "Name\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for multipack\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n self,\n sampler,\n batch_size,\n batch_max_len,\n lengths,\n packing_efficiency_estimate=1.0,\n drop_last=False,\n num_count_samples=16,\n **kwargs,\n)\nBatch sampler class for multipack" + }, + { + "objectID": "docs/api/utils.callbacks.profiler.html", + "href": "docs/api/utils.callbacks.profiler.html", + "title": "utils.callbacks.profiler", + "section": "", + "text": "utils.callbacks.profiler\nHF Trainer callback for creating pytorch profiling 
snapshots\n\n\n\n\n\nName\nDescription\n\n\n\n\nPytorchProfilerCallback\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\n\n\n\n\n\nutils.callbacks.profiler.PytorchProfilerCallback(self, steps_to_profile=5)\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps." + }, + { + "objectID": "docs/api/utils.callbacks.profiler.html#classes", + "href": "docs/api/utils.callbacks.profiler.html#classes", + "title": "utils.callbacks.profiler", + "section": "", + "text": "Name\nDescription\n\n\n\n\nPytorchProfilerCallback\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\n\n\n\n\n\nutils.callbacks.profiler.PytorchProfilerCallback(self, steps_to_profile=5)\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps." + }, + { + "objectID": "docs/api/integrations.lm_eval.args.html", + "href": "docs/api/integrations.lm_eval.args.html", + "title": "integrations.lm_eval.args", + "section": "", + "text": "integrations.lm_eval.args\nModule for handling lm eval harness input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nLMEvalArgs\nInput args for lm eval harness\n\n\n\n\n\nintegrations.lm_eval.args.LMEvalArgs()\nInput args for lm eval harness" + }, + { + "objectID": "docs/api/integrations.lm_eval.args.html#classes", + "href": "docs/api/integrations.lm_eval.args.html#classes", + "title": "integrations.lm_eval.args", + "section": "", + "text": "Name\nDescription\n\n\n\n\nLMEvalArgs\nInput args for lm eval harness\n\n\n\n\n\nintegrations.lm_eval.args.LMEvalArgs()\nInput args for lm eval harness" + }, + { + "objectID": "docs/api/utils.data.pretraining.html", + "href": "docs/api/utils.data.pretraining.html", + "title": "utils.data.pretraining", + "section": "", + "text": "utils.data.pretraining\nutils.data.pretraining\ndata handling specific to pretraining" + }, + { + "objectID": "docs/api/evaluate.html", + "href": "docs/api/evaluate.html", + "title": 
"evaluate", + "section": "", + "text": "evaluate\nModule for evaluating models.\n\n\n\n\n\nName\nDescription\n\n\n\n\nevaluate\nEvaluate a model on training and validation datasets.\n\n\nevaluate_dataset\nHelper function to evaluate a single dataset.\n\n\n\n\n\nevaluate.evaluate(cfg, dataset_meta)\nEvaluate a model on training and validation datasets.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nDataset metadata containing training and evaluation datasets.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDict[str, float]\nDictionary mapping metric names to their values.\n\n\n\n\n\n\n\nevaluate.evaluate_dataset(trainer, dataset, dataset_type, flash_optimum=False)\nHelper function to evaluate a single dataset.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrainer\nTrainer\nThe trainer instance.\nrequired\n\n\ndataset\nDataset\nDataset to evaluate.\nrequired\n\n\ndataset_type\nstr\nType of dataset (‘train’ or ‘eval’).\nrequired\n\n\nflash_optimum\nbool\nWhether to use flash optimum.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nOptional[Dict[str, float]]\nDictionary of metrics or None if dataset is None." 
+ }, + { + "objectID": "docs/api/evaluate.html#functions", + "href": "docs/api/evaluate.html#functions", + "title": "evaluate", + "section": "", + "text": "Name\nDescription\n\n\n\n\nevaluate\nEvaluate a model on training and validation datasets.\n\n\nevaluate_dataset\nHelper function to evaluate a single dataset.\n\n\n\n\n\nevaluate.evaluate(cfg, dataset_meta)\nEvaluate a model on training and validation datasets.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nDataset metadata containing training and evaluation datasets.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDict[str, float]\nDictionary mapping metric names to their values.\n\n\n\n\n\n\n\nevaluate.evaluate_dataset(trainer, dataset, dataset_type, flash_optimum=False)\nHelper function to evaluate a single dataset.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrainer\nTrainer\nThe trainer instance.\nrequired\n\n\ndataset\nDataset\nDataset to evaluate.\nrequired\n\n\ndataset_type\nstr\nType of dataset (‘train’ or ‘eval’).\nrequired\n\n\nflash_optimum\nbool\nWhether to use flash optimum.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nOptional[Dict[str, float]]\nDictionary of metrics or None if dataset is None." + }, + { + "objectID": "docs/api/utils.dict.html", + "href": "docs/api/utils.dict.html", + "title": "utils.dict", + "section": "", + "text": "utils.dict\nModule containing the DictDefault class\n\n\n\n\n\nName\nDescription\n\n\n\n\nDictDefault\nA Dict that returns None instead of returning empty Dict for missing keys.\n\n\n\n\n\nutils.dict.DictDefault()\nA Dict that returns None instead of returning empty Dict for missing keys." 
+ }, + { + "objectID": "docs/api/utils.dict.html#classes", + "href": "docs/api/utils.dict.html#classes", + "title": "utils.dict", + "section": "", + "text": "Name\nDescription\n\n\n\n\nDictDefault\nA Dict that returns None instead of returning empty Dict for missing keys.\n\n\n\n\n\nutils.dict.DictDefault()\nA Dict that returns None instead of returning empty Dict for missing keys." + }, + { + "objectID": "docs/api/cli.utils.html", + "href": "docs/api/cli.utils.html", + "title": "cli.utils", + "section": "", + "text": "cli.utils\nUtility methods for axolotl CLI.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_options_from_config\nCreate Click options from the fields of a Pydantic model.\n\n\nadd_options_from_dataclass\nCreate Click options from the fields of a dataclass.\n\n\nbuild_command\nBuild command list from base command and options.\n\n\ndownload_file\nDownload a single file and return its processing status.\n\n\nfetch_from_github\nSync files from a specific directory in the GitHub repository.\n\n\nfilter_none_kwargs\nWraps function to remove None-valued kwargs.\n\n\nload_model_and_tokenizer\nHelper function for loading a model and tokenizer specified in the given axolotl\n\n\nstrip_optional_type\nExtracts the non-None type from an Optional / Union type.\n\n\n\n\n\ncli.utils.add_options_from_config(config_class)\nCreate Click options from the fields of a Pydantic model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig_class\nType[BaseModel]\nPyDantic model with fields to parse from the CLI\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nFunction decorator for Axolotl CLI command.\n\n\n\n\n\n\n\ncli.utils.add_options_from_dataclass(config_class)\nCreate Click options from the fields of a dataclass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig_class\nType[Any]\nDataclass with fields to parse from the CLI.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nFunction 
decorator for Axolotl CLI command.\n\n\n\n\n\n\n\ncli.utils.build_command(base_cmd, options)\nBuild command list from base command and options.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbase_cmd\nlist[str]\nCommand without options.\nrequired\n\n\noptions\ndict[str, Any]\nOptions to parse and append to base command.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[str]\nList of strings giving shell command.\n\n\n\n\n\n\n\ncli.utils.download_file(file_info, raw_base_url, dest_path, dir_prefix)\nDownload a single file and return its processing status.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfile_info\ntuple\nTuple of (file_path, remote_sha).\nrequired\n\n\nraw_base_url\nstr\nBase URL for raw GitHub content.\nrequired\n\n\ndest_path\nPath\nLocal destination directory.\nrequired\n\n\ndir_prefix\nstr\nDirectory prefix to filter files.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[str, str]\nTuple of (file_path, status) where status is ‘new’, ‘updated’, or ‘unchanged’.\n\n\n\n\n\n\n\ncli.utils.fetch_from_github(dir_prefix, dest_dir=None, max_workers=5)\nSync files from a specific directory in the GitHub repository.\nOnly downloads files that don’t exist locally or have changed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndir_prefix\nstr\nDirectory prefix to filter files (e.g., ‘examples/’, ‘deepspeed_configs/’).\nrequired\n\n\ndest_dir\nstr | None\nLocal destination directory.\nNone\n\n\nmax_workers\nint\nMaximum number of concurrent downloads.\n5\n\n\n\n\n\n\n\ncli.utils.filter_none_kwargs(func)\nWraps function to remove None-valued kwargs.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfunc\nCallable\nFunction to wrap.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nWrapped function.\n\n\n\n\n\n\n\ncli.utils.load_model_and_tokenizer(cfg, inference=False)\nHelper function for loading a model and tokenizer specified in 
the given axolotl\nconfig.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ninference\nbool\nBoolean denoting inference mode.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer | PreTrainedTokenizerFast | Any]\ntransformers model and tokenizer.\n\n\n\n\n\n\n\ncli.utils.strip_optional_type(field_type)\nExtracts the non-None type from an Optional / Union type.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield_type\ntype | str | None\nType of field for Axolotl CLI command.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nIf the input type is Union[T, None] or Optional[T], returns T. Otherwise returns the input type unchanged." + }, + { + "objectID": "docs/api/cli.utils.html#functions", + "href": "docs/api/cli.utils.html#functions", + "title": "cli.utils", + "section": "", + "text": "Name\nDescription\n\n\n\n\nadd_options_from_config\nCreate Click options from the fields of a Pydantic model.\n\n\nadd_options_from_dataclass\nCreate Click options from the fields of a dataclass.\n\n\nbuild_command\nBuild command list from base command and options.\n\n\ndownload_file\nDownload a single file and return its processing status.\n\n\nfetch_from_github\nSync files from a specific directory in the GitHub repository.\n\n\nfilter_none_kwargs\nWraps function to remove None-valued kwargs.\n\n\nload_model_and_tokenizer\nHelper function for loading a model and tokenizer specified in the given axolotl\n\n\nstrip_optional_type\nExtracts the non-None type from an Optional / Union type.\n\n\n\n\n\ncli.utils.add_options_from_config(config_class)\nCreate Click options from the fields of a Pydantic model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig_class\nType[BaseModel]\nPyDantic model with fields to parse from the 
CLI\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nFunction decorator for Axolotl CLI command.\n\n\n\n\n\n\n\ncli.utils.add_options_from_dataclass(config_class)\nCreate Click options from the fields of a dataclass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig_class\nType[Any]\nDataclass with fields to parse from the CLI.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nFunction decorator for Axolotl CLI command.\n\n\n\n\n\n\n\ncli.utils.build_command(base_cmd, options)\nBuild command list from base command and options.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbase_cmd\nlist[str]\nCommand without options.\nrequired\n\n\noptions\ndict[str, Any]\nOptions to parse and append to base command.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[str]\nList of strings giving shell command.\n\n\n\n\n\n\n\ncli.utils.download_file(file_info, raw_base_url, dest_path, dir_prefix)\nDownload a single file and return its processing status.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfile_info\ntuple\nTuple of (file_path, remote_sha).\nrequired\n\n\nraw_base_url\nstr\nBase URL for raw GitHub content.\nrequired\n\n\ndest_path\nPath\nLocal destination directory.\nrequired\n\n\ndir_prefix\nstr\nDirectory prefix to filter files.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[str, str]\nTuple of (file_path, status) where status is ‘new’, ‘updated’, or ‘unchanged’.\n\n\n\n\n\n\n\ncli.utils.fetch_from_github(dir_prefix, dest_dir=None, max_workers=5)\nSync files from a specific directory in the GitHub repository.\nOnly downloads files that don’t exist locally or have changed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndir_prefix\nstr\nDirectory prefix to filter files (e.g., ‘examples/’, ‘deepspeed_configs/’).\nrequired\n\n\ndest_dir\nstr | None\nLocal destination 
directory.\nNone\n\n\nmax_workers\nint\nMaximum number of concurrent downloads.\n5\n\n\n\n\n\n\n\ncli.utils.filter_none_kwargs(func)\nWraps function to remove None-valued kwargs.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfunc\nCallable\nFunction to wrap.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nWrapped function.\n\n\n\n\n\n\n\ncli.utils.load_model_and_tokenizer(cfg, inference=False)\nHelper function for loading a model and tokenizer specified in the given axolotl\nconfig.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ninference\nbool\nBoolean denoting inference mode.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer | PreTrainedTokenizerFast | Any]\ntransformers model and tokenizer.\n\n\n\n\n\n\n\ncli.utils.strip_optional_type(field_type)\nExtracts the non-None type from an Optional / Union type.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield_type\ntype | str | None\nType of field for Axolotl CLI command.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nIf the input type is Union[T, None] or Optional[T], returns T. Otherwise returns the input type unchanged." 
+ }, + { + "objectID": "docs/api/prompt_strategies.pygmalion.html", + "href": "docs/api/prompt_strategies.pygmalion.html", + "title": "prompt_strategies.pygmalion", + "section": "", + "text": "prompt_strategies.pygmalion\nModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class\n\n\n\n\n\nName\nDescription\n\n\n\n\nPygmalionPromptTokenizingStrategy\nTokenizing strategy for Pygmalion.\n\n\nPygmalionPrompter\nPrompter for Pygmalion.\n\n\n\n\n\nprompt_strategies.pygmalion.PygmalionPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n *args,\n **kwargs,\n)\nTokenizing strategy for Pygmalion.\n\n\n\nprompt_strategies.pygmalion.PygmalionPrompter(self, *args, **kwargs)\nPrompter for Pygmalion." + }, + { + "objectID": "docs/api/prompt_strategies.pygmalion.html#classes", + "href": "docs/api/prompt_strategies.pygmalion.html#classes", + "title": "prompt_strategies.pygmalion", + "section": "", + "text": "Name\nDescription\n\n\n\n\nPygmalionPromptTokenizingStrategy\nTokenizing strategy for Pygmalion.\n\n\nPygmalionPrompter\nPrompter for Pygmalion.\n\n\n\n\n\nprompt_strategies.pygmalion.PygmalionPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n *args,\n **kwargs,\n)\nTokenizing strategy for Pygmalion.\n\n\n\nprompt_strategies.pygmalion.PygmalionPrompter(self, *args, **kwargs)\nPrompter for Pygmalion." 
+ }, + { + "objectID": "docs/api/core.training_args.html", + "href": "docs/api/core.training_args.html", + "title": "core.training_args", + "section": "", + "text": "core.training_args\nextra axolotl specific training args\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n self,\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n simpo_gamma=None,\n)\nCPO config for CPO training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n self,\n model_type=None,\n 
lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n self,\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n 
embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n self,\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n self,\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n 
max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n)\nReward config for Reward training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n self,\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n 
chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n self,\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n)\nMixin class for the Axolotl training args." 
+ }, + { + "objectID": "docs/api/core.training_args.html#classes", + "href": "docs/api/core.training_args.html#classes", + "title": "core.training_args", + "section": "", + "text": "Name\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n self,\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n simpo_gamma=None,\n)\nCPO config for CPO training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n self,\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n 
sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n self,\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n 
lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n self,\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n self,\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n 
relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n)\nReward config for Reward training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n self,\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n 
kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n self,\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_optimizer=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n sequence_parallel_degree=1,\n)\nMixin class for the Axolotl training args." + }, + { + "objectID": "docs/api/cli.inference.html", + "href": "docs/api/cli.inference.html", + "title": "cli.inference", + "section": "", + "text": "cli.inference\nCLI to run inference on a trained model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_inference or do_inference_gradio.\n\n\ndo_inference\nRuns inference on the command line in a loop. 
User input is accepted, a chat template\n\n\ndo_inference_gradio\nRuns inference in a Gradio interface. User input is accepted, a chat template is\n\n\nget_multi_line_input\nGets multi-line input from terminal.\n\n\n\n\n\ncli.inference.do_cli(config=Path('examples/'), gradio=False, **kwargs)\nParses axolotl config, CLI args, and calls do_inference or do_inference_gradio.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.inference.do_inference(cfg, cli_args)\nRuns inference on the command line in a loop. User input is accepted, a chat template\nis (optionally) applied, and the model specified in the axolotl config is used to\ngenerate completions according to a default generation config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nInferenceCliArgs\nInference-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\ncli.inference.do_inference_gradio(cfg, cli_args)\nRuns inference in a Gradio interface. User input is accepted, a chat template is\n(optionally) applied, and the model specified in the axolotl config is used to\ngenerate completions according to a default generation config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nInferenceCliArgs\nInference-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\ncli.inference.get_multi_line_input()\nGets multi-line input from terminal.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPossibly multi-line, possibly empty stdin input as a string." 
+ }, + { + "objectID": "docs/api/cli.inference.html#functions", + "href": "docs/api/cli.inference.html#functions", + "title": "cli.inference", + "section": "", + "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_inference or do_inference_gradio.\n\n\ndo_inference\nRuns inference on the command line in a loop. User input is accepted, a chat template\n\n\ndo_inference_gradio\nRuns inference in a Gradio interface. User input is accepted, a chat template is\n\n\nget_multi_line_input\nGets multi-line input from terminal.\n\n\n\n\n\ncli.inference.do_cli(config=Path('examples/'), gradio=False, **kwargs)\nParses axolotl config, CLI args, and calls do_inference or do_inference_gradio.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.inference.do_inference(cfg, cli_args)\nRuns inference on the command line in a loop. User input is accepted, a chat template\nis (optionally) applied, and the model specified in the axolotl config is used to\ngenerate completions according to a default generation config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nInferenceCliArgs\nInference-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\ncli.inference.do_inference_gradio(cfg, cli_args)\nRuns inference in a Gradio interface. 
User input is accepted, a chat template is\n(optionally) applied, and the model specified in the axolotl config is used to\ngenerate completions according to a default generation config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nInferenceCliArgs\nInference-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\ncli.inference.get_multi_line_input()\nGets multi-line input from terminal.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPossibly multi-line, possibly empty stdin input as a string." + }, + { + "objectID": "docs/api/kernels.lora.html", + "href": "docs/api/kernels.lora.html", + "title": "kernels.lora", + "section": "", + "text": "kernels.lora\nModule for definition of Low-Rank Adaptation (LoRA) Triton kernels.\nSee “LoRA: Low-Rank Adaptation of Large Language Models”\n(https://arxiv.org/abs/2106.09685).\nCredit to unsloth (https://unsloth.ai/) for inspiration for this implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nLoRA_MLP\nOptimized LoRA MLP implementation.\n\n\nLoRA_O\nOptimized LoRA implementation for output projection.\n\n\nLoRA_QKV\nOptimized LoRA QKV implementation with quantization support.\n\n\n\n\n\nkernels.lora.LoRA_MLP()\nOptimized LoRA MLP implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nPerforms backward pass computation for LoRA MLP.\n\n\nforward\nForward pass for LoRA MLP.\n\n\n\n\n\nkernels.lora.LoRA_MLP.backward(ctx, grad_output)\nPerforms backward pass computation for LoRA MLP.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\nContext object storing tensors saved during forward pass\nrequired\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to layer output\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor | None\nTuple containing gradients for all inputs from forward pass:\n\n\n\nNone\n- Input 
gradient tensor (or None)\n\n\n\nNone\n- None for weights/quantization states\n\n\n\ntorch.Tensor | None\n- LoRA A/B matrix gradients (or None)\n\n\n\ntorch.Tensor | None\n- None for scaling factors\n\n\n\nNone\n- None for activation functions and flags\n\n\n\n\n\n\n\nkernels.lora.LoRA_MLP.forward(\n ctx,\n X,\n gate_weight,\n gate_quant,\n gate_A,\n gate_B,\n gate_scale,\n up_weight,\n up_quant,\n up_A,\n up_B,\n up_scale,\n down_weight,\n down_quant,\n down_A,\n down_B,\n down_scale,\n activation_fn,\n activation_fn_backward,\n inplace=True,\n)\nForward pass for LoRA MLP.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\n\nAutograd context\nrequired\n\n\nX\ntorch.Tensor\nInput features\nrequired\n\n\ngate_weight\ntorch.Tensor\nGate projection weight\nrequired\n\n\ngate_quant\nobject | None\nGate quantization state\nrequired\n\n\ngate_A\ntorch.Tensor | None\nGate LoRA A matrix\nrequired\n\n\ngate_B\ntorch.Tensor | None\nGate LoRA B matrix\nrequired\n\n\ngate_scale\nfloat\nGate LoRA scale\nrequired\n\n\nup_weight\ntorch.Tensor\nUp-projection weight\nrequired\n\n\nup_quant\nobject | None\nUp-projection quantization state\nrequired\n\n\nup_A\ntorch.Tensor | None\nUp-projection LoRA A matrix\nrequired\n\n\nup_B\ntorch.Tensor | None\nUp-projection LoRA B matrix\nrequired\n\n\nup_scale\nfloat\nUp-projection LoRA scale\nrequired\n\n\ndown_weight\ntorch.Tensor\nDown-projection weight\nrequired\n\n\ndown_quant\nobject | None\nDown-projection quantization state\nrequired\n\n\ndown_A\ntorch.Tensor | None\nDown-projection LoRA A matrix\nrequired\n\n\ndown_B\ntorch.Tensor | None\nDown-projection LoRA B matrix\nrequired\n\n\ndown_scale\nfloat\nDown-projection LoRA scale\nrequired\n\n\nactivation_fn\nCallable\nForward activation function\nrequired\n\n\nactivation_fn_backward\nCallable\nBackward activation function\nrequired\n\n\ninplace\nbool | None\nWhether to perform operations 
in-place\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput transformed by multi-layer perceptron and activation function\n\n\n\n\n\n\n\n\n\nkernels.lora.LoRA_O()\nOptimized LoRA implementation for output projection.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass computing gradients for LoRA output projection.\n\n\nforward\nForward pass for output projection with LoRA.\n\n\n\n\n\nkernels.lora.LoRA_O.backward(ctx, dY)\nBackward pass computing gradients for LoRA output projection.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\nAutograd context\nrequired\n\n\ndY\ntorch.Tensor\nGradient of loss with respect to output\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, None, None, torch.Tensor | None, torch.Tensor | None, None]\nTuple containing gradients for all forward inputs\n\n\n\n\n\n\n\nkernels.lora.LoRA_O.forward(ctx, X, W, W_quant, A, B, S)\nForward pass for output projection with LoRA.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\nAutograd context\nrequired\n\n\nX\ntorch.Tensor\nInput tensor\nrequired\n\n\nW\ntorch.Tensor\nOutput projection weight\nrequired\n\n\nW_quant\nQuantState | None\nWeight quantization state\nrequired\n\n\nA\ntorch.Tensor | None\nLoRA A matrix\nrequired\n\n\nB\ntorch.Tensor | None\nLoRA B matrix\nrequired\n\n\nS\nfloat\nLoRA scaling factor\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput projection tensor\n\n\n\n\n\n\n\n\n\nkernels.lora.LoRA_QKV()\nOptimized LoRA QKV implementation with quantization support.\nImplements efficient computation of query, key, value projections with LoRA,\nsupporting quantization and memory optimization.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass computing gradients for LoRA QKV.\n\n\nforward\nForward pass computing Q, K, V projections with 
LoRA.\n\n\n\n\n\nkernels.lora.LoRA_QKV.backward(ctx, q_grad, k_grad, v_grad)\nBackward pass computing gradients for LoRA QKV.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\nAutograd context\nrequired\n\n\nq_grad\ntorch.Tensor\nGradient for query projection\nrequired\n\n\nk_grad\ntorch.Tensor\nGradient for key projection\nrequired\n\n\nv_grad\ntorch.Tensor\nGradient for value projection\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, None, None, torch.Tensor | None, torch.Tensor | None, None, None, None, torch.Tensor | None, torch.Tensor | None, None, None, None, torch.Tensor | None, torch.Tensor | None, None, None]\nTuple containing gradients for all forward inputs\n\n\n\n\n\n\n\nkernels.lora.LoRA_QKV.forward(\n ctx,\n X,\n q_weight,\n q_quant,\n q_A,\n q_B,\n q_scale,\n k_weight,\n k_quant,\n k_A,\n k_B,\n k_scale,\n v_weight,\n v_quant,\n v_A,\n v_B,\n v_scale,\n inplace=True,\n)\nForward pass computing Q, K, V projections with LoRA.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\nAutograd context\nrequired\n\n\nX\ntorch.Tensor\nInput tensor\nrequired\n\n\nq_weight\ntorch.Tensor\nQuery projection weight\nrequired\n\n\nq_quant\nQuantState | None\nQuery quantization state\nrequired\n\n\nq_A\ntorch.Tensor | None\nQuery LoRA A matrix\nrequired\n\n\nq_B\ntorch.Tensor | None\nQuery LoRA B matrix\nrequired\n\n\nq_scale\nfloat\nQuery LoRA scale\nrequired\n\n\nk_weight\ntorch.Tensor\nKey projection weight\nrequired\n\n\nk_quant\nQuantState | None\nKey quantization state\nrequired\n\n\nk_A\ntorch.Tensor | None\nKey LoRA A matrix\nrequired\n\n\nk_B\ntorch.Tensor | None\nKey LoRA B matrix\nrequired\n\n\nk_scale\nfloat\nKey LoRA scale\nrequired\n\n\nv_weight\ntorch.Tensor\nValue projection weight\nrequired\n\n\nv_quant\nQuantState | None\nValue quantization state\nrequired\n\n\nv_A\ntorch.Tensor | None\nValue LoRA 
A matrix\nrequired\n\n\nv_B\ntorch.Tensor | None\nValue LoRA B matrix\nrequired\n\n\nv_scale\nfloat\nValue LoRA scale\nrequired\n\n\ninplace\nbool\nWhether to perform operations in-place\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple of (Query, Key, Value) projection tensors\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_lora_mlp_geglu\nApplies LoRA to MLP layer with GEGLU activation.\n\n\napply_lora_mlp_swiglu\nApplies LoRA to MLP layer with SwiGLU activation.\n\n\napply_lora_o\nApplies LoRA to output projection layer.\n\n\napply_lora_qkv\nApplies LoRA to compute Query, Key, Value projections.\n\n\nget_lora_parameters\nGets LoRA parameters from a projection module.\n\n\nmatmul_lora\nEfficient fused matmul + LoRA computation.\n\n\n\n\n\nkernels.lora.apply_lora_mlp_geglu(self, X, inplace=True)\nApplies LoRA to MLP layer with GEGLU activation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput tensor for the MLP layer\nrequired\n\n\ninplace\nbool\nWhether to perform operations in-place to save memory\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor after applying LoRA-adapted MLP with GEGLU activation\n\n\n\n\n\n\n\nkernels.lora.apply_lora_mlp_swiglu(self, X, inplace=True)\nApplies LoRA to MLP layer with SwiGLU activation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput tensor for the MLP layer\nrequired\n\n\ninplace\nbool\nWhether to perform operations in-place to save memory\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor after applying LoRA-adapted MLP with SwiGLU activation\n\n\n\n\n\n\n\nkernels.lora.apply_lora_o(self, X)\nApplies LoRA to output projection layer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput 
tensor\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nTransformed output tensor\n\n\n\n\n\n\n\nkernels.lora.apply_lora_qkv(self, X, inplace=True)\nApplies LoRA to compute Query, Key, Value projections.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput tensor\nrequired\n\n\ninplace\nbool\nWhether to perform operations in-place\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple of (Query, Key, Value) projection tensors\n\n\n\n\n\n\n\nkernels.lora.get_lora_parameters(proj)\nGets LoRA parameters from a projection module.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nproj\nnn.Module\nThe projection module to extract parameters from.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nA tuple containing the base weight matrix, quantization state, LoRA A matrix,\n\n\n\nQuantState | None\nLoRA B matrix, and scaling factor. 
States and matrices may be None if not\n\n\n\ntorch.Tensor | None\navailable.\n\n\n\n\n\n\n\nkernels.lora.matmul_lora(X, W, W_quant, A, B, s, out=None)\nEfficient fused matmul + LoRA computation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput tensor [*, in_features]\nrequired\n\n\nW\ntorch.Tensor\nBase weight matrix [out_features, in_features]\nrequired\n\n\nW_quant\nQuantState\nQuantization state for W\nrequired\n\n\nA\ntorch.Tensor\nLoRA A matrix [rank, in_features]\nrequired\n\n\nB\ntorch.Tensor\nLoRA B matrix [out_features, rank]\nrequired\n\n\ns\nfloat\nLoRA scaling factor\nrequired\n\n\nout\ntorch.Tensor | None\nOptional output tensor for inplace operations\nNone\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nResult of X @ W + X @ A @ B" + }, + { + "objectID": "docs/api/kernels.lora.html#classes", + "href": "docs/api/kernels.lora.html#classes", + "title": "kernels.lora", + "section": "", + "text": "Name\nDescription\n\n\n\n\nLoRA_MLP\nOptimized LoRA MLP implementation.\n\n\nLoRA_O\nOptimized LoRA implementation for output projection.\n\n\nLoRA_QKV\nOptimized LoRA QKV implementation with quantization support.\n\n\n\n\n\nkernels.lora.LoRA_MLP()\nOptimized LoRA MLP implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nPerforms backward pass computation for LoRA MLP.\n\n\nforward\nForward pass for LoRA MLP.\n\n\n\n\n\nkernels.lora.LoRA_MLP.backward(ctx, grad_output)\nPerforms backward pass computation for LoRA MLP.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\nContext object storing tensors saved during forward pass\nrequired\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to layer output\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor | None\nTuple containing gradients for all inputs from forward pass:\n\n\n\nNone\n- Input gradient tensor (or None)\n\n\n\nNone\n- None for 
weights/quantization states\n\n\n\ntorch.Tensor | None\n- LoRA A/B matrix gradients (or None)\n\n\n\ntorch.Tensor | None\n- None for scaling factors\n\n\n\nNone\n- None for activation functions and flags\n\n\n\n\n\n\n\nkernels.lora.LoRA_MLP.forward(\n ctx,\n X,\n gate_weight,\n gate_quant,\n gate_A,\n gate_B,\n gate_scale,\n up_weight,\n up_quant,\n up_A,\n up_B,\n up_scale,\n down_weight,\n down_quant,\n down_A,\n down_B,\n down_scale,\n activation_fn,\n activation_fn_backward,\n inplace=True,\n)\nForward pass for LoRA MLP.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\n\nAutograd context\nrequired\n\n\nX\ntorch.Tensor\nInput features\nrequired\n\n\ngate_weight\ntorch.Tensor\nGate projection weight\nrequired\n\n\ngate_quant\nobject | None\nGate quantization state\nrequired\n\n\ngate_A\ntorch.Tensor | None\nGate LoRA A matrix\nrequired\n\n\ngate_B\ntorch.Tensor | None\nGate LoRA B matrix\nrequired\n\n\ngate_scale\nfloat\nGate LoRA scale\nrequired\n\n\nup_weight\ntorch.Tensor\nUp-projection weight\nrequired\n\n\nup_quant\nobject | None\nUp-projection quantization state\nrequired\n\n\nup_A\ntorch.Tensor | None\nUp-projection LoRA A matrix\nrequired\n\n\nup_B\ntorch.Tensor | None\nUp-projection LoRA B matrix\nrequired\n\n\nup_scale\nfloat\nUp-projection LoRA scale\nrequired\n\n\ndown_weight\ntorch.Tensor\nDown-projection weight\nrequired\n\n\ndown_quant\nobject | None\nDown-projection quantization state\nrequired\n\n\ndown_A\ntorch.Tensor | None\nDown-projection LoRA A matrix\nrequired\n\n\ndown_B\ntorch.Tensor | None\nDown-projection LoRA B matrix\nrequired\n\n\ndown_scale\nfloat\nDown-projection LoRA scale\nrequired\n\n\nactivation_fn\nCallable\nForward activation function\nrequired\n\n\nactivation_fn_backward\nCallable\nBackward activation function\nrequired\n\n\ninplace\nbool | None\nWhether to perform operations in-place\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput transformed by multi-layer 
perceptron and activation function\n\n\n\n\n\n\n\n\n\nkernels.lora.LoRA_O()\nOptimized LoRA implementation for output projection.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass computing gradients for LoRA output projection.\n\n\nforward\nForward pass for output projection with LoRA.\n\n\n\n\n\nkernels.lora.LoRA_O.backward(ctx, dY)\nBackward pass computing gradients for LoRA output projection.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\nAutograd context\nrequired\n\n\ndY\ntorch.Tensor\nGradient of loss with respect to output\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, None, None, torch.Tensor | None, torch.Tensor | None, None]\nTuple containing gradients for all forward inputs\n\n\n\n\n\n\n\nkernels.lora.LoRA_O.forward(ctx, X, W, W_quant, A, B, S)\nForward pass for output projection with LoRA.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\nAutograd context\nrequired\n\n\nX\ntorch.Tensor\nInput tensor\nrequired\n\n\nW\ntorch.Tensor\nOutput projection weight\nrequired\n\n\nW_quant\nQuantState | None\nWeight quantization state\nrequired\n\n\nA\ntorch.Tensor | None\nLoRA A matrix\nrequired\n\n\nB\ntorch.Tensor | None\nLoRA B matrix\nrequired\n\n\nS\nfloat\nLoRA scaling factor\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput projection tensor\n\n\n\n\n\n\n\n\n\nkernels.lora.LoRA_QKV()\nOptimized LoRA QKV implementation with quantization support.\nImplements efficient computation of query, key, value projections with LoRA,\nsupporting quantization and memory optimization.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass computing gradients for LoRA QKV.\n\n\nforward\nForward pass computing Q, K, V projections with LoRA.\n\n\n\n\n\nkernels.lora.LoRA_QKV.backward(ctx, q_grad, k_grad, v_grad)\nBackward pass computing gradients for LoRA 
QKV.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\nAutograd context\nrequired\n\n\nq_grad\ntorch.Tensor\nGradient for query projection\nrequired\n\n\nk_grad\ntorch.Tensor\nGradient for key projection\nrequired\n\n\nv_grad\ntorch.Tensor\nGradient for value projection\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, None, None, torch.Tensor | None, torch.Tensor | None, None, None, None, torch.Tensor | None, torch.Tensor | None, None, None, None, torch.Tensor | None, torch.Tensor | None, None, None]\nTuple containing gradients for all forward inputs\n\n\n\n\n\n\n\nkernels.lora.LoRA_QKV.forward(\n ctx,\n X,\n q_weight,\n q_quant,\n q_A,\n q_B,\n q_scale,\n k_weight,\n k_quant,\n k_A,\n k_B,\n k_scale,\n v_weight,\n v_quant,\n v_A,\n v_B,\n v_scale,\n inplace=True,\n)\nForward pass computing Q, K, V projections with LoRA.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\nAutograd context\nrequired\n\n\nX\ntorch.Tensor\nInput tensor\nrequired\n\n\nq_weight\ntorch.Tensor\nQuery projection weight\nrequired\n\n\nq_quant\nQuantState | None\nQuery quantization state\nrequired\n\n\nq_A\ntorch.Tensor | None\nQuery LoRA A matrix\nrequired\n\n\nq_B\ntorch.Tensor | None\nQuery LoRA B matrix\nrequired\n\n\nq_scale\nfloat\nQuery LoRA scale\nrequired\n\n\nk_weight\ntorch.Tensor\nKey projection weight\nrequired\n\n\nk_quant\nQuantState | None\nKey quantization state\nrequired\n\n\nk_A\ntorch.Tensor | None\nKey LoRA A matrix\nrequired\n\n\nk_B\ntorch.Tensor | None\nKey LoRA B matrix\nrequired\n\n\nk_scale\nfloat\nKey LoRA scale\nrequired\n\n\nv_weight\ntorch.Tensor\nValue projection weight\nrequired\n\n\nv_quant\nQuantState | None\nValue quantization state\nrequired\n\n\nv_A\ntorch.Tensor | None\nValue LoRA A matrix\nrequired\n\n\nv_B\ntorch.Tensor | None\nValue LoRA B matrix\nrequired\n\n\nv_scale\nfloat\nValue LoRA 
scale\nrequired\n\n\ninplace\nbool\nWhether to perform operations in-place\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple of (Query, Key, Value) projection tensors" + }, + { + "objectID": "docs/api/kernels.lora.html#functions", + "href": "docs/api/kernels.lora.html#functions", + "title": "kernels.lora", + "section": "", + "text": "Name\nDescription\n\n\n\n\napply_lora_mlp_geglu\nApplies LoRA to MLP layer with GEGLU activation.\n\n\napply_lora_mlp_swiglu\nApplies LoRA to MLP layer with SwiGLU activation.\n\n\napply_lora_o\nApplies LoRA to output projection layer.\n\n\napply_lora_qkv\nApplies LoRA to compute Query, Key, Value projections.\n\n\nget_lora_parameters\nGets LoRA parameters from a projection module.\n\n\nmatmul_lora\nEfficient fused matmul + LoRA computation.\n\n\n\n\n\nkernels.lora.apply_lora_mlp_geglu(self, X, inplace=True)\nApplies LoRA to MLP layer with GEGLU activation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput tensor for the MLP layer\nrequired\n\n\ninplace\nbool\nWhether to perform operations in-place to save memory\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor after applying LoRA-adapted MLP with GEGLU activation\n\n\n\n\n\n\n\nkernels.lora.apply_lora_mlp_swiglu(self, X, inplace=True)\nApplies LoRA to MLP layer with SwiGLU activation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput tensor for the MLP layer\nrequired\n\n\ninplace\nbool\nWhether to perform operations in-place to save memory\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor after applying LoRA-adapted MLP with SwiGLU activation\n\n\n\n\n\n\n\nkernels.lora.apply_lora_o(self, X)\nApplies LoRA to output projection layer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput 
tensor\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nTransformed output tensor\n\n\n\n\n\n\n\nkernels.lora.apply_lora_qkv(self, X, inplace=True)\nApplies LoRA to compute Query, Key, Value projections.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput tensor\nrequired\n\n\ninplace\nbool\nWhether to perform operations in-place\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple of (Query, Key, Value) projection tensors\n\n\n\n\n\n\n\nkernels.lora.get_lora_parameters(proj)\nGets LoRA parameters from a projection module.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nproj\nnn.Module\nThe projection module to extract parameters from.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nA tuple containing the base weight matrix, quantization state, LoRA A matrix,\n\n\n\nQuantState | None\nLoRA B matrix, and scaling factor. 
States and matrices may be None if not\n\n\n\ntorch.Tensor | None\navailable.\n\n\n\n\n\n\n\nkernels.lora.matmul_lora(X, W, W_quant, A, B, s, out=None)\nEfficient fused matmul + LoRA computation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput tensor [*, in_features]\nrequired\n\n\nW\ntorch.Tensor\nBase weight matrix [out_features, in_features]\nrequired\n\n\nW_quant\nQuantState\nQuantization state for W\nrequired\n\n\nA\ntorch.Tensor\nLoRA A matrix [rank, in_features]\nrequired\n\n\nB\ntorch.Tensor\nLoRA B matrix [out_features, rank]\nrequired\n\n\ns\nfloat\nLoRA scaling factor\nrequired\n\n\nout\ntorch.Tensor | None\nOptional output tensor for inplace operations\nNone\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nResult of X @ W + X @ A @ B" + }, + { + "objectID": "docs/api/cli.evaluate.html", + "href": "docs/api/cli.evaluate.html", + "title": "cli.evaluate", + "section": "", + "text": "cli.evaluate\nCLI to run evaluation on a model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_evaluate.\n\n\ndo_evaluate\nEvaluates a transformers model by first loading the dataset(s) specified in the\n\n\n\n\n\ncli.evaluate.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_evaluate.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.evaluate.do_evaluate(cfg, cli_args)\nEvaluates a transformers model by first loading the dataset(s) specified in the\naxolotl config, and then calling axolotl.evaluate.evaluate, which computes\nevaluation metrics on the given dataset(s) and writes them to disk.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to 
values.\nrequired\n\n\ncli_args\nTrainerCliArgs\nCLI arguments.\nrequired" + }, + { + "objectID": "docs/api/cli.evaluate.html#functions", + "href": "docs/api/cli.evaluate.html#functions", + "title": "cli.evaluate", + "section": "", + "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_evaluate.\n\n\ndo_evaluate\nEvaluates a transformers model by first loading the dataset(s) specified in the\n\n\n\n\n\ncli.evaluate.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_evaluate.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.evaluate.do_evaluate(cfg, cli_args)\nEvaluates a transformers model by first loading the dataset(s) specified in the\naxolotl config, and then calling axolotl.evaluate.evaluate, which computes\nevaluation metrics on the given dataset(s) and writes them to disk.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nTrainerCliArgs\nCLI arguments.\nrequired" + }, + { + "objectID": "docs/api/utils.collators.batching.html", + "href": "docs/api/utils.collators.batching.html", + "title": "utils.collators.batching", + "section": "", + "text": "utils.collators.batching\nData collators for axolotl to pad labels and position_ids for packed sequences. 
Also\nincludes logic for handling sequence parallelism collation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nDataCollatorForSeq2Seq\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\nPretrainingBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nV2BatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\n\n\n\nutils.collators.batching.BatchSamplerDataCollatorForSeq2Seq(\n self,\n tokenizer,\n model=None,\n padding=True,\n max_length=None,\n pad_to_multiple_of=None,\n label_pad_token_id=-100,\n position_pad_token_id=0,\n return_tensors='pt',\n sequence_parallel_degree=1,\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.DataCollatorForSeq2Seq(\n self,\n tokenizer,\n model=None,\n padding=True,\n max_length=None,\n pad_to_multiple_of=None,\n label_pad_token_id=-100,\n position_pad_token_id=0,\n return_tensors='pt',\n sequence_parallel_degree=1,\n)\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer\n[PreTrainedTokenizer] or [PreTrainedTokenizerFast]\nThe tokenizer used for encoding the data.\nrequired\n\n\nmodel\n[PreTrainedModel]\nThe model that is being trained. If set and has the prepare_decoder_input_ids_from_labels, use it to prepare the decoder_input_ids This is useful when using label_smoothing to avoid calculating loss twice.\nNone\n\n\npadding\nbool, str or [~utils.PaddingStrategy], optional, defaults to True\nSelect a strategy to pad the returned sequences (according to the model’s padding side and padding index) among: - True or 'longest' (default): Pad to the longest sequence in the batch (or no padding if only a single sequence is provided). 
- 'max_length': Pad to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided. - False or 'do_not_pad': No padding (i.e., can output a batch with sequences of different lengths).\nTrue\n\n\nmax_length\nint, optional\nMaximum length of the returned list and optionally padding length (see above).\nNone\n\n\npad_to_multiple_of\nint, optional\nIf set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).\nNone\n\n\nlabel_pad_token_id\nint, optional, defaults to -100\nThe id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).\n-100\n\n\nreturn_tensors\nstr\nThe type of Tensor to return. Allowable values are “np”, “pt” and “tf”.\n'pt'\n\n\nsequence_parallel_degree\nint\nThe degree of sequence parallelism. Default to 1 for no sequence parallelism.\n1\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_sequence_parallelism\nApply sequence parallelism slicing to a batch.\n\n\n\n\n\nutils.collators.batching.DataCollatorForSeq2Seq.apply_sequence_parallelism(\n batch,\n)\nApply sequence parallelism slicing to a batch.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbatch\ndict[str, torch.Tensor]\nBatch dictionary from parent collator.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nSliced batch dictionary.\n\n\n\n\n\n\n\n\n\nutils.collators.batching.PretrainingBatchSamplerDataCollatorForSeq2Seq(\n self,\n *args,\n multipack_attn=True,\n **kwargs,\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.V2BatchSamplerDataCollatorForSeq2Seq(\n self,\n tokenizer,\n model=None,\n padding=True,\n max_length=None,\n pad_to_multiple_of=None,\n label_pad_token_id=-100,\n position_pad_token_id=0,\n return_tensors='pt',\n 
sequence_parallel_degree=1,\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nadjust_position_ids_for_slice\nAdjust position IDs for a sliced sequence to maintain proper relative positions.\n\n\n\n\n\nutils.collators.batching.adjust_position_ids_for_slice(position_ids, start_idx)\nAdjust position IDs for a sliced sequence to maintain proper relative positions.\nThis handles the case where position IDs might not be contiguous due to sample\npacking." + }, + { + "objectID": "docs/api/utils.collators.batching.html#classes", + "href": "docs/api/utils.collators.batching.html#classes", + "title": "utils.collators.batching", + "section": "", + "text": "Name\nDescription\n\n\n\n\nBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nDataCollatorForSeq2Seq\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\nPretrainingBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nV2BatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\n\n\n\nutils.collators.batching.BatchSamplerDataCollatorForSeq2Seq(\n self,\n tokenizer,\n model=None,\n padding=True,\n max_length=None,\n pad_to_multiple_of=None,\n label_pad_token_id=-100,\n position_pad_token_id=0,\n return_tensors='pt',\n sequence_parallel_degree=1,\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.DataCollatorForSeq2Seq(\n self,\n tokenizer,\n model=None,\n padding=True,\n max_length=None,\n pad_to_multiple_of=None,\n label_pad_token_id=-100,\n position_pad_token_id=0,\n return_tensors='pt',\n sequence_parallel_degree=1,\n)\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer\n[PreTrainedTokenizer] or 
[PreTrainedTokenizerFast]\nThe tokenizer used for encoding the data.\nrequired\n\n\nmodel\n[PreTrainedModel]\nThe model that is being trained. If set and has the prepare_decoder_input_ids_from_labels, use it to prepare the decoder_input_ids This is useful when using label_smoothing to avoid calculating loss twice.\nNone\n\n\npadding\nbool, str or [~utils.PaddingStrategy], optional, defaults to True\nSelect a strategy to pad the returned sequences (according to the model’s padding side and padding index) among: - True or 'longest' (default): Pad to the longest sequence in the batch (or no padding if only a single sequence is provided). - 'max_length': Pad to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided. - False or 'do_not_pad': No padding (i.e., can output a batch with sequences of different lengths).\nTrue\n\n\nmax_length\nint, optional\nMaximum length of the returned list and optionally padding length (see above).\nNone\n\n\npad_to_multiple_of\nint, optional\nIf set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).\nNone\n\n\nlabel_pad_token_id\nint, optional, defaults to -100\nThe id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).\n-100\n\n\nreturn_tensors\nstr\nThe type of Tensor to return. Allowable values are “np”, “pt” and “tf”.\n'pt'\n\n\nsequence_parallel_degree\nint\nThe degree of sequence parallelism. 
Default to 1 for no sequence parallelism.\n1\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_sequence_parallelism\nApply sequence parallelism slicing to a batch.\n\n\n\n\n\nutils.collators.batching.DataCollatorForSeq2Seq.apply_sequence_parallelism(\n batch,\n)\nApply sequence parallelism slicing to a batch.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbatch\ndict[str, torch.Tensor]\nBatch dictionary from parent collator.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nSliced batch dictionary.\n\n\n\n\n\n\n\n\n\nutils.collators.batching.PretrainingBatchSamplerDataCollatorForSeq2Seq(\n self,\n *args,\n multipack_attn=True,\n **kwargs,\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.V2BatchSamplerDataCollatorForSeq2Seq(\n self,\n tokenizer,\n model=None,\n padding=True,\n max_length=None,\n pad_to_multiple_of=None,\n label_pad_token_id=-100,\n position_pad_token_id=0,\n return_tensors='pt',\n sequence_parallel_degree=1,\n)\nCollator for multipack specific to the using the BatchSampler" + }, + { + "objectID": "docs/api/utils.collators.batching.html#functions", + "href": "docs/api/utils.collators.batching.html#functions", + "title": "utils.collators.batching", + "section": "", + "text": "Name\nDescription\n\n\n\n\nadjust_position_ids_for_slice\nAdjust position IDs for a sliced sequence to maintain proper relative positions.\n\n\n\n\n\nutils.collators.batching.adjust_position_ids_for_slice(position_ids, start_idx)\nAdjust position IDs for a sliced sequence to maintain proper relative positions.\nThis handles the case where position IDs might not be contiguous due to sample\npacking." 
+ }, + { + "objectID": "docs/api/prompt_strategies.completion.html", + "href": "docs/api/prompt_strategies.completion.html", + "title": "prompt_strategies.completion", + "section": "", + "text": "prompt_strategies.completion\nBasic completion text\n\n\n\n\n\nName\nDescription\n\n\n\n\nCompletionPromptTokenizingStrategy\nTokenizing strategy for Completion prompts.\n\n\nCompletionPrompter\nPrompter for completion\n\n\n\n\n\nprompt_strategies.completion.CompletionPromptTokenizingStrategy(\n self,\n *args,\n max_length=None,\n **kwargs,\n)\nTokenizing strategy for Completion prompts.\n\n\n\nprompt_strategies.completion.CompletionPrompter()\nPrompter for completion" + }, + { + "objectID": "docs/api/prompt_strategies.completion.html#classes", + "href": "docs/api/prompt_strategies.completion.html#classes", + "title": "prompt_strategies.completion", + "section": "", + "text": "Name\nDescription\n\n\n\n\nCompletionPromptTokenizingStrategy\nTokenizing strategy for Completion prompts.\n\n\nCompletionPrompter\nPrompter for completion\n\n\n\n\n\nprompt_strategies.completion.CompletionPromptTokenizingStrategy(\n self,\n *args,\n max_length=None,\n **kwargs,\n)\nTokenizing strategy for Completion prompts.\n\n\n\nprompt_strategies.completion.CompletionPrompter()\nPrompter for completion" + }, + { + "objectID": "docs/api/prompt_strategies.dpo.zephyr.html", + "href": "docs/api/prompt_strategies.dpo.zephyr.html", + "title": "prompt_strategies.dpo.zephyr", + "section": "", + "text": "prompt_strategies.dpo.zephyr\nprompt_strategies.dpo.zephyr\nDPO strategies for zephyr" + }, + { + "objectID": "docs/api/prompt_strategies.metharme.html", + "href": "docs/api/prompt_strategies.metharme.html", + "title": "prompt_strategies.metharme", + "section": "", + "text": "prompt_strategies.metharme\nModule containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class\n\n\n\n\n\nName\nDescription\n\n\n\n\nMetharmePromptTokenizingStrategy\nTokenizing strategy for the Metharme 
models\n\n\nMetharmePrompter\nPrompter for the Metharme models.\n\n\n\n\n\nprompt_strategies.metharme.MetharmePromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for the Metharme models\n\n\n\nprompt_strategies.metharme.MetharmePrompter(self, *args, **kwargs)\nPrompter for the Metharme models." + }, + { + "objectID": "docs/api/prompt_strategies.metharme.html#classes", + "href": "docs/api/prompt_strategies.metharme.html#classes", + "title": "prompt_strategies.metharme", + "section": "", + "text": "Name\nDescription\n\n\n\n\nMetharmePromptTokenizingStrategy\nTokenizing strategy for the Metharme models\n\n\nMetharmePrompter\nPrompter for the Metharme models.\n\n\n\n\n\nprompt_strategies.metharme.MetharmePromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for the Metharme models\n\n\n\nprompt_strategies.metharme.MetharmePrompter(self, *args, **kwargs)\nPrompter for the Metharme models." 
+ }, + { + "objectID": "docs/api/prompt_strategies.orpo.chat_template.html", + "href": "docs/api/prompt_strategies.orpo.chat_template.html", + "title": "prompt_strategies.orpo.chat_template", + "section": "", + "text": "prompt_strategies.orpo.chat_template\nchatml prompt tokenization strategy for ORPO\n\n\n\n\n\nName\nDescription\n\n\n\n\nMessage\nmessage/turn\n\n\nMessageList\nconversation\n\n\nORPODatasetParsingStrategy\nStrategy to parse chosen rejected dataset into messagelist\n\n\nORPOPrompter\nSingle Turn prompter for ORPO\n\n\nORPOTokenizingStrategy\nrejected_input_ids\n\n\n\n\n\nprompt_strategies.orpo.chat_template.Message()\nmessage/turn\n\n\n\nprompt_strategies.orpo.chat_template.MessageList()\nconversation\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy()\nStrategy to parse chosen rejected dataset into messagelist\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_chosen_conversation_thread\nDataset structure mappings\n\n\nget_prompt\nMap the data to extract everything up to the last turn\n\n\nget_rejected_conversation_thread\nDataset structure mappings\n\n\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_chosen_conversation_thread(\n prompt,\n)\nDataset structure mappings\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_prompt(\n prompt,\n)\nMap the data to extract everything up to the last turn\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_rejected_conversation_thread(\n prompt,\n)\nDataset structure mappings\n\n\n\n\n\nprompt_strategies.orpo.chat_template.ORPOPrompter(\n self,\n chat_template,\n tokenizer,\n)\nSingle Turn prompter for ORPO\n\n\n\nprompt_strategies.orpo.chat_template.ORPOTokenizingStrategy(\n self,\n *args,\n dataset_parser=None,\n **kwargs,\n)\nrejected_input_ids\ninput_ids\nrejected_attention_mask\nattention_mask\nrejected_labels\nlabels\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nload\nchatml transforms for datasets with system, input, 
chosen, rejected\n\n\n\n\n\nprompt_strategies.orpo.chat_template.load(tokenizer, cfg, ds_cfg=None, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected" + }, + { + "objectID": "docs/api/prompt_strategies.orpo.chat_template.html#classes", + "href": "docs/api/prompt_strategies.orpo.chat_template.html#classes", + "title": "prompt_strategies.orpo.chat_template", + "section": "", + "text": "Name\nDescription\n\n\n\n\nMessage\nmessage/turn\n\n\nMessageList\nconversation\n\n\nORPODatasetParsingStrategy\nStrategy to parse chosen rejected dataset into messagelist\n\n\nORPOPrompter\nSingle Turn prompter for ORPO\n\n\nORPOTokenizingStrategy\nrejected_input_ids\n\n\n\n\n\nprompt_strategies.orpo.chat_template.Message()\nmessage/turn\n\n\n\nprompt_strategies.orpo.chat_template.MessageList()\nconversation\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy()\nStrategy to parse chosen rejected dataset into messagelist\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_chosen_conversation_thread\nDataset structure mappings\n\n\nget_prompt\nMap the data to extract everything up to the last turn\n\n\nget_rejected_conversation_thread\nDataset structure mappings\n\n\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_chosen_conversation_thread(\n prompt,\n)\nDataset structure mappings\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_prompt(\n prompt,\n)\nMap the data to extract everything up to the last turn\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_rejected_conversation_thread(\n prompt,\n)\nDataset structure mappings\n\n\n\n\n\nprompt_strategies.orpo.chat_template.ORPOPrompter(\n self,\n chat_template,\n tokenizer,\n)\nSingle Turn prompter for ORPO\n\n\n\nprompt_strategies.orpo.chat_template.ORPOTokenizingStrategy(\n self,\n *args,\n dataset_parser=None,\n **kwargs,\n)\nrejected_input_ids\ninput_ids\nrejected_attention_mask\nattention_mask\nrejected_labels\nlabels" + 
}, + { + "objectID": "docs/api/prompt_strategies.orpo.chat_template.html#functions", + "href": "docs/api/prompt_strategies.orpo.chat_template.html#functions", + "title": "prompt_strategies.orpo.chat_template", + "section": "", + "text": "Name\nDescription\n\n\n\n\nload\nchatml transforms for datasets with system, input, chosen, rejected\n\n\n\n\n\nprompt_strategies.orpo.chat_template.load(tokenizer, cfg, ds_cfg=None, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected" + }, + { + "objectID": "docs/api/prompt_strategies.alpaca_w_system.html", + "href": "docs/api/prompt_strategies.alpaca_w_system.html", + "title": "prompt_strategies.alpaca_w_system", + "section": "", + "text": "prompt_strategies.alpaca_w_system\nPrompt strategies loader for alpaca instruction datasets with system prompts\n\n\n\n\n\nName\nDescription\n\n\n\n\nInstructionWSystemPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nOpenOrcaPromptTokenizingStrategy\nTokenizing strategy for OpenOrca datasets\n\n\nOpenOrcaSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\nSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset\n\n\n\n\n\nprompt_strategies.alpaca_w_system.InstructionWSystemPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for OpenOrca datasets\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaSystemDataPrompter(\n self,\n prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\n\nprompt_strategies.alpaca_w_system.SystemDataPrompter(\n self,\n 
prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset" + }, + { + "objectID": "docs/api/prompt_strategies.alpaca_w_system.html#classes", + "href": "docs/api/prompt_strategies.alpaca_w_system.html#classes", + "title": "prompt_strategies.alpaca_w_system", + "section": "", + "text": "Name\nDescription\n\n\n\n\nInstructionWSystemPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nOpenOrcaPromptTokenizingStrategy\nTokenizing strategy for OpenOrca datasets\n\n\nOpenOrcaSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\nSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset\n\n\n\n\n\nprompt_strategies.alpaca_w_system.InstructionWSystemPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaPromptTokenizingStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nTokenizing strategy for OpenOrca datasets\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaSystemDataPrompter(\n self,\n prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\n\nprompt_strategies.alpaca_w_system.SystemDataPrompter(\n self,\n prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset" + }, + { + "objectID": "docs/api/utils.model_shard_quant.html", + "href": "docs/api/utils.model_shard_quant.html", + "title": "utils.model_shard_quant", + "section": "", + "text": "utils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_and_quantize\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to 
dtype.\n\n\n\n\n\nutils.model_shard_quant.load_and_quantize(\n module,\n name,\n value,\n device=None,\n dtype=None,\n skip_names=None,\n to_cpu=False,\n to_meta=False,\n verbose=False,\n quant_method='bnb',\n)\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\nQuantizes Params4bit on device then places on “cpu” if to_cpu=True or “meta” if to_meta=True." + }, + { + "objectID": "docs/api/utils.model_shard_quant.html#functions", + "href": "docs/api/utils.model_shard_quant.html#functions", + "title": "utils.model_shard_quant", + "section": "", + "text": "Name\nDescription\n\n\n\n\nload_and_quantize\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\n\n\n\n\n\nutils.model_shard_quant.load_and_quantize(\n module,\n name,\n value,\n device=None,\n dtype=None,\n skip_names=None,\n to_cpu=False,\n to_meta=False,\n verbose=False,\n quant_method='bnb',\n)\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\nQuantizes Params4bit on device then places on “cpu” if to_cpu=True or “meta” if to_meta=True." + }, + { + "objectID": "docs/api/cli.config.html", + "href": "docs/api/cli.config.html", + "title": "cli.config", + "section": "", + "text": "cli.config\nConfiguration loading and processing.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_remote_config\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\n\n\nchoose_config\nHelper method for choosing a axolotl config YAML file (considering only files\n\n\nload_cfg\nLoads the axolotl configuration stored at config, validates it, and performs\n\n\nprepare_plugins\nRegisters the plugins for the given configuration.\n\n\n\n\n\ncli.config.check_remote_config(config)\nFirst, determines if the passed config is a valid HTTPS URL. 
Then, attempts to query\nfor it and parse its content, first as JSON, then as YAML (YAML is preferred).\nFinally, the parsed content is written to a local file and its path is returned.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[str, Path]\nHTTPS URL to a YAML or JSON file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion[str, Path]\nEither the original config if it’s not a valid HTTPS URL, or the path to the\n\n\n\nUnion[str, Path]\ndownloaded remote config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the remote configuration is neither valid JSON or YAML.\n\n\n\nRuntimeError\nIf some request-related exception occurs from the file download.\n\n\n\nException\nCatch-all for any other exception.\n\n\n\n\n\n\n\ncli.config.choose_config(path)\nHelper method for choosing a axolotl config YAML file (considering only files\nending with .yml or .yaml). If more than one config file exists in the passed\npath, the user is prompted to choose one.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nPath\nDirectory in which config file(s) are stored.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to either (1) the sole YAML file, or (2) if more than one YAML files exist,\n\n\n\nstr\nthe user-selected YAML file.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf no YAML files are found in the given path.\n\n\n\n\n\n\n\ncli.config.load_cfg(config=Path('examples/'), **kwargs)\nLoads the axolotl configuration stored at config, validates it, and performs\nvarious setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[str, Path]\nPath (local or remote) to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDictDefault\nDictDefault 
mapping configuration keys to values.\n\n\n\n\n\n\n\ncli.config.prepare_plugins(cfg)\nRegisters the plugins for the given configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired" + }, + { + "objectID": "docs/api/cli.config.html#functions", + "href": "docs/api/cli.config.html#functions", + "title": "cli.config", + "section": "", + "text": "Name\nDescription\n\n\n\n\ncheck_remote_config\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\n\n\nchoose_config\nHelper method for choosing a axolotl config YAML file (considering only files\n\n\nload_cfg\nLoads the axolotl configuration stored at config, validates it, and performs\n\n\nprepare_plugins\nRegisters the plugins for the given configuration.\n\n\n\n\n\ncli.config.check_remote_config(config)\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\nfor it and parse its content, first as JSON, then as YAML (YAML is preferred).\nFinally, the parsed content is written to a local file and its path is returned.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[str, Path]\nHTTPS URL to a YAML or JSON file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion[str, Path]\nEither the original config if it’s not a valid HTTPS URL, or the path to the\n\n\n\nUnion[str, Path]\ndownloaded remote config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the remote configuration is neither valid JSON or YAML.\n\n\n\nRuntimeError\nIf some request-related exception occurs from the file download.\n\n\n\nException\nCatch-all for any other exception.\n\n\n\n\n\n\n\ncli.config.choose_config(path)\nHelper method for choosing a axolotl config YAML file (considering only files\nending with .yml or .yaml). 
If more than one config file exists in the passed\npath, the user is prompted to choose one.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nPath\nDirectory in which config file(s) are stored.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to either (1) the sole YAML file, or (2) if more than one YAML files exist,\n\n\n\nstr\nthe user-selected YAML file.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf no YAML files are found in the given path.\n\n\n\n\n\n\n\ncli.config.load_cfg(config=Path('examples/'), **kwargs)\nLoads the axolotl configuration stored at config, validates it, and performs\nvarious setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[str, Path]\nPath (local or remote) to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDictDefault\nDictDefault mapping configuration keys to values.\n\n\n\n\n\n\n\ncli.config.prepare_plugins(cfg)\nRegisters the plugins for the given configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired" + }, + { + "objectID": "docs/api/utils.schemas.enums.html", + "href": "docs/api/utils.schemas.enums.html", + "title": "utils.schemas.enums", + "section": "", + "text": "utils.schemas.enums\nEnums for Axolotl input config\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatTemplate\nChat templates configuration subset\n\n\nCustomSupportedOptimizers\nCustom supported optimizers\n\n\nRLType\nRL trainer type configuration subset\n\n\n\n\n\nutils.schemas.enums.ChatTemplate()\nChat templates configuration subset\n\n\n\nutils.schemas.enums.CustomSupportedOptimizers()\nCustom supported optimizers\n\n\n\nutils.schemas.enums.RLType()\nRL trainer type configuration subset" + }, + { + "objectID": 
"docs/api/utils.schemas.enums.html#classes", + "href": "docs/api/utils.schemas.enums.html#classes", + "title": "utils.schemas.enums", + "section": "", + "text": "Name\nDescription\n\n\n\n\nChatTemplate\nChat templates configuration subset\n\n\nCustomSupportedOptimizers\nCustom supported optimizers\n\n\nRLType\nRL trainer type configuration subset\n\n\n\n\n\nutils.schemas.enums.ChatTemplate()\nChat templates configuration subset\n\n\n\nutils.schemas.enums.CustomSupportedOptimizers()\nCustom supported optimizers\n\n\n\nutils.schemas.enums.RLType()\nRL trainer type configuration subset" + }, + { + "objectID": "docs/api/cli.preprocess.html", + "href": "docs/api/cli.preprocess.html", + "title": "cli.preprocess", + "section": "", + "text": "cli.preprocess\nCLI to run preprocessing of a dataset.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_preprocess.\n\n\ndo_preprocess\nPreprocesses dataset specified in axolotl config.\n\n\n\n\n\ncli.preprocess.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_preprocess.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.preprocess.do_preprocess(cfg, cli_args)\nPreprocesses dataset specified in axolotl config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs\nPreprocessing-specific CLI arguments.\nrequired" + }, + { + "objectID": "docs/api/cli.preprocess.html#functions", + "href": "docs/api/cli.preprocess.html#functions", + "title": "cli.preprocess", + "section": "", + "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_preprocess.\n\n\ndo_preprocess\nPreprocesses dataset specified in axolotl 
config.\n\n\n\n\n\ncli.preprocess.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_preprocess.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.preprocess.do_preprocess(cfg, cli_args)\nPreprocesses dataset specified in axolotl config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs\nPreprocessing-specific CLI arguments.\nrequired" + }, + { + "objectID": "docs/api/core.chat.messages.html", + "href": "docs/api/core.chat.messages.html", + "title": "core.chat.messages", + "section": "", + "text": "core.chat.messages\ninternal message representations of chat messages\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatFormattedChats\nChat formatted chats with formatter and optional train on inputs\n\n\nChats\ntop level data structure for chat conversations\n\n\nMessageContentTypes\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\nMessageContents\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\nMessageRoles\nMessage roles for the system, user, assistant, and tools\n\n\nMessages\nMessages with role, content, metadata, weight, and chat formatting\n\n\nPreferenceChats\nrepresentation for preference data for chat\n\n\nSpecialToken\nSpecial tokens for beginning of string and end of string\n\n\nTool\nTool with description, function, and parameters\n\n\nToolCallContents\nTool call contents with name, arguments, and optional id\n\n\nToolCallFunction\nTool call function with name and arguments\n\n\nToolResponseContents\nTool response contents with name, content, and optional id\n\n\n\n\n\ncore.chat.messages.ChatFormattedChats()\nChat formatted chats with 
formatter and optional train on inputs\n\n\n\ncore.chat.messages.Chats()\ntop level data structure for chat conversations\n\n\n\ncore.chat.messages.MessageContentTypes()\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\n\ncore.chat.messages.MessageContents()\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\n\ncore.chat.messages.MessageRoles()\nMessage roles for the system, user, assistant, and tools\n\n\n\ncore.chat.messages.Messages()\nMessages with role, content, metadata, weight, and chat formatting\n\n\n\ncore.chat.messages.PreferenceChats()\nrepresentation for preference data for chat\n\n\n\ncore.chat.messages.SpecialToken()\nSpecial tokens for beginning of string and end of string\n\n\n\ncore.chat.messages.Tool()\nTool with description, function, and parameters\n\n\n\ncore.chat.messages.ToolCallContents()\nTool call contents with name, arguments, and optional id\n\n\n\ncore.chat.messages.ToolCallFunction()\nTool call function with name and arguments\n\n\n\ncore.chat.messages.ToolResponseContents()\nTool response contents with name, content, and optional id" + }, + { + "objectID": "docs/api/core.chat.messages.html#classes", + "href": "docs/api/core.chat.messages.html#classes", + "title": "core.chat.messages", + "section": "", + "text": "Name\nDescription\n\n\n\n\nChatFormattedChats\nChat formatted chats with formatter and optional train on inputs\n\n\nChats\ntop level data structure for chat conversations\n\n\nMessageContentTypes\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\nMessageContents\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\nMessageRoles\nMessage roles for the system, user, assistant, and tools\n\n\nMessages\nMessages with role, content, metadata, weight, and chat formatting\n\n\nPreferenceChats\nrepresentation for preference data for chat\n\n\nSpecialToken\nSpecial tokens for beginning of string and 
end of string\n\n\nTool\nTool with description, function, and parameters\n\n\nToolCallContents\nTool call contents with name, arguments, and optional id\n\n\nToolCallFunction\nTool call function with name and arguments\n\n\nToolResponseContents\nTool response contents with name, content, and optional id\n\n\n\n\n\ncore.chat.messages.ChatFormattedChats()\nChat formatted chats with formatter and optional train on inputs\n\n\n\ncore.chat.messages.Chats()\ntop level data structure for chat conversations\n\n\n\ncore.chat.messages.MessageContentTypes()\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\n\ncore.chat.messages.MessageContents()\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\n\ncore.chat.messages.MessageRoles()\nMessage roles for the system, user, assistant, and tools\n\n\n\ncore.chat.messages.Messages()\nMessages with role, content, metadata, weight, and chat formatting\n\n\n\ncore.chat.messages.PreferenceChats()\nrepresentation for preference data for chat\n\n\n\ncore.chat.messages.SpecialToken()\nSpecial tokens for beginning of string and end of string\n\n\n\ncore.chat.messages.Tool()\nTool with description, function, and parameters\n\n\n\ncore.chat.messages.ToolCallContents()\nTool call contents with name, arguments, and optional id\n\n\n\ncore.chat.messages.ToolCallFunction()\nTool call function with name and arguments\n\n\n\ncore.chat.messages.ToolResponseContents()\nTool response contents with name, content, and optional id" + }, + { + "objectID": "docs/api/prompt_strategies.dpo.chat_template.html", + "href": "docs/api/prompt_strategies.dpo.chat_template.html", + "title": "prompt_strategies.dpo.chat_template", + "section": "", + "text": "prompt_strategies.dpo.chat_template\nprompt_strategies.dpo.chat_template\nDPO prompt strategies for using tokenizer chat templates." 
+ }, + { + "objectID": "docs/api/utils.schemas.peft.html", + "href": "docs/api/utils.schemas.peft.html", + "title": "utils.schemas.peft", + "section": "", + "text": "utils.schemas.peft\nPydantic models for PEFT-related configuration\n\n\n\n\n\nName\nDescription\n\n\n\n\nLoftQConfig\nLoftQ configuration subset\n\n\nLoraConfig\nPeft / LoRA configuration subset\n\n\nPeftConfig\npeftq configuration subset\n\n\nReLoRAConfig\nReLoRA configuration subset\n\n\n\n\n\nutils.schemas.peft.LoftQConfig()\nLoftQ configuration subset\n\n\n\nutils.schemas.peft.LoraConfig()\nPeft / LoRA configuration subset\n\n\n\nutils.schemas.peft.PeftConfig()\npeftq configuration subset\n\n\n\nutils.schemas.peft.ReLoRAConfig()\nReLoRA configuration subset" + }, + { + "objectID": "docs/api/utils.schemas.peft.html#classes", + "href": "docs/api/utils.schemas.peft.html#classes", + "title": "utils.schemas.peft", + "section": "", + "text": "Name\nDescription\n\n\n\n\nLoftQConfig\nLoftQ configuration subset\n\n\nLoraConfig\nPeft / LoRA configuration subset\n\n\nPeftConfig\npeftq configuration subset\n\n\nReLoRAConfig\nReLoRA configuration subset\n\n\n\n\n\nutils.schemas.peft.LoftQConfig()\nLoftQ configuration subset\n\n\n\nutils.schemas.peft.LoraConfig()\nPeft / LoRA configuration subset\n\n\n\nutils.schemas.peft.PeftConfig()\npeftq configuration subset\n\n\n\nutils.schemas.peft.ReLoRAConfig()\nReLoRA configuration subset" + }, + { + "objectID": "docs/api/train.html", + "href": "docs/api/train.html", + "title": "train", + "section": "", + "text": "train\nPrepare and train a model on a dataset. 
Can also infer from a model or merge lora\n\n\n\n\n\nName\nDescription\n\n\n\n\ncreate_model_card\nCreate a model card for the trained model if needed.\n\n\ndetermine_resume_checkpoint\nDetermine the checkpoint to resume from based on configuration.\n\n\nexecute_training\nExecute the training process with appropriate SDP kernel configurations.\n\n\nhandle_untrained_tokens_fix\nApply fixes for untrained tokens if configured.\n\n\nsave_initial_configs\nSave initial configurations before training.\n\n\nsave_trained_model\nSave the trained model according to configuration and training setup.\n\n\nsetup_model_and_tokenizer\nLoad the tokenizer, processor (for multimodal models), and model based on configuration.\n\n\nsetup_model_and_trainer\nLoad model, tokenizer, trainer, etc. Helper function to encapsulate the full\n\n\nsetup_model_card\nSet up the Axolotl badge and add the Axolotl config to the model card if available.\n\n\nsetup_reference_model\nSet up the reference model for RL training if needed.\n\n\nsetup_signal_handler\nSet up signal handler for graceful termination.\n\n\ntrain\nTrain a model on the given dataset.\n\n\n\n\n\ntrain.create_model_card(cfg, trainer)\nCreate a model card for the trained model if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object with model card creation capabilities.\nrequired\n\n\n\n\n\n\n\ntrain.determine_resume_checkpoint(cfg)\nDetermine the checkpoint to resume from based on configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr | None\nPath to the checkpoint to resume from, or None if not resuming.\n\n\n\n\n\n\n\ntrain.execute_training(cfg, trainer, resume_from_checkpoint)\nExecute the training process with appropriate 
SDP kernel configurations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nAny\nThe configured trainer object.\nrequired\n\n\nresume_from_checkpoint\nstr | None\nPath to checkpoint to resume from, if applicable.\nrequired\n\n\n\n\n\n\n\ntrain.handle_untrained_tokens_fix(\n cfg,\n model,\n tokenizer,\n train_dataset,\n safe_serialization,\n)\nApply fixes for untrained tokens if configured.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to apply fixes to.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer for token identification.\nrequired\n\n\ntrain_dataset\nDataset\nThe training dataset to use.\nrequired\n\n\nsafe_serialization\nbool\nWhether to use safe serialization when saving.\nrequired\n\n\n\n\n\n\n\ntrain.save_initial_configs(cfg, tokenizer, model, peft_config)\nSave initial configurations before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer to save.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to save configuration for.\nrequired\n\n\npeft_config\nPeftConfig | None\nThe PEFT configuration to save if applicable.\nrequired\n\n\n\n\n\n\n\ntrain.save_trained_model(cfg, trainer, model, safe_serialization)\nSave the trained model according to configuration and training setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nAny\nThe trainer object.\nrequired\n\n\nmodel\nPreTrainedModel\nThe trained model to save.\nrequired\n\n\nsafe_serialization\nbool\nWhether to use safe 
serialization.\nrequired\n\n\n\n\n\n\n\ntrain.setup_model_and_tokenizer(cfg)\nLoad the tokenizer, processor (for multimodal models), and model based on configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None]\nTuple containing model, tokenizer, peft_config (if LoRA / QLoRA, else None), and processor (if multimodal, else None).\n\n\n\n\n\n\n\ntrain.setup_model_and_trainer(cfg, dataset_meta)\nLoad model, tokenizer, trainer, etc. Helper function to encapsulate the full\ntrainer setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration dictionary with training parameters.\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nObject with training, validation datasets and metadata.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[HFRLTrainerBuilder | HFCausalTrainerBuilder, PeftModel | PreTrainedModel, PreTrainedTokenizer, PeftConfig | None]\nTuple of: - Trainer (Causal or RLHF) - Model - Tokenizer - PEFT config\n\n\n\n\n\n\n\ntrain.setup_model_card(cfg)\nSet up the Axolotl badge and add the Axolotl config to the model card if available.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\ntrain.setup_reference_model(cfg, tokenizer)\nSet up the reference model for RL training if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer to use for the reference model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nPreTrainedModel | None\nReference model if needed for RL training, 
None otherwise.\n\n\n\n\n\n\n\ntrain.setup_signal_handler(cfg, model, safe_serialization)\nSet up signal handler for graceful termination.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to save on termination\nrequired\n\n\nsafe_serialization\nbool\nWhether to use safe serialization when saving\nrequired\n\n\n\n\n\n\n\ntrain.train(cfg, dataset_meta)\nTrain a model on the given dataset.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration dictionary with training parameters\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nObject with training, validation datasets and metadata\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PeftModel | PreTrainedModel, PreTrainedTokenizer, Trainer]\nTuple of (model, tokenizer) after training" + }, + { + "objectID": "docs/api/train.html#functions", + "href": "docs/api/train.html#functions", + "title": "train", + "section": "", + "text": "Name\nDescription\n\n\n\n\ncreate_model_card\nCreate a model card for the trained model if needed.\n\n\ndetermine_resume_checkpoint\nDetermine the checkpoint to resume from based on configuration.\n\n\nexecute_training\nExecute the training process with appropriate SDP kernel configurations.\n\n\nhandle_untrained_tokens_fix\nApply fixes for untrained tokens if configured.\n\n\nsave_initial_configs\nSave initial configurations before training.\n\n\nsave_trained_model\nSave the trained model according to configuration and training setup.\n\n\nsetup_model_and_tokenizer\nLoad the tokenizer, processor (for multimodal models), and model based on configuration.\n\n\nsetup_model_and_trainer\nLoad model, tokenizer, trainer, etc. 
Helper function to encapsulate the full\n\n\nsetup_model_card\nSet up the Axolotl badge and add the Axolotl config to the model card if available.\n\n\nsetup_reference_model\nSet up the reference model for RL training if needed.\n\n\nsetup_signal_handler\nSet up signal handler for graceful termination.\n\n\ntrain\nTrain a model on the given dataset.\n\n\n\n\n\ntrain.create_model_card(cfg, trainer)\nCreate a model card for the trained model if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object with model card creation capabilities.\nrequired\n\n\n\n\n\n\n\ntrain.determine_resume_checkpoint(cfg)\nDetermine the checkpoint to resume from based on configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr | None\nPath to the checkpoint to resume from, or None if not resuming.\n\n\n\n\n\n\n\ntrain.execute_training(cfg, trainer, resume_from_checkpoint)\nExecute the training process with appropriate SDP kernel configurations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nAny\nThe configured trainer object.\nrequired\n\n\nresume_from_checkpoint\nstr | None\nPath to checkpoint to resume from, if applicable.\nrequired\n\n\n\n\n\n\n\ntrain.handle_untrained_tokens_fix(\n cfg,\n model,\n tokenizer,\n train_dataset,\n safe_serialization,\n)\nApply fixes for untrained tokens if configured.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to apply fixes to.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer for token 
identification.\nrequired\n\n\ntrain_dataset\nDataset\nThe training dataset to use.\nrequired\n\n\nsafe_serialization\nbool\nWhether to use safe serialization when saving.\nrequired\n\n\n\n\n\n\n\ntrain.save_initial_configs(cfg, tokenizer, model, peft_config)\nSave initial configurations before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer to save.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to save configuration for.\nrequired\n\n\npeft_config\nPeftConfig | None\nThe PEFT configuration to save if applicable.\nrequired\n\n\n\n\n\n\n\ntrain.save_trained_model(cfg, trainer, model, safe_serialization)\nSave the trained model according to configuration and training setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nAny\nThe trainer object.\nrequired\n\n\nmodel\nPreTrainedModel\nThe trained model to save.\nrequired\n\n\nsafe_serialization\nbool\nWhether to use safe serialization.\nrequired\n\n\n\n\n\n\n\ntrain.setup_model_and_tokenizer(cfg)\nLoad the tokenizer, processor (for multimodal models), and model based on configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None]\nTuple containing model, tokenizer, peft_config (if LoRA / QLoRA, else None), and processor (if multimodal, else None).\n\n\n\n\n\n\n\ntrain.setup_model_and_trainer(cfg, dataset_meta)\nLoad model, tokenizer, trainer, etc. 
Helper function to encapsulate the full\ntrainer setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration dictionary with training parameters.\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nObject with training, validation datasets and metadata.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[HFRLTrainerBuilder | HFCausalTrainerBuilder, PeftModel | PreTrainedModel, PreTrainedTokenizer, PeftConfig | None]\nTuple of: - Trainer (Causal or RLHF) - Model - Tokenizer - PEFT config\n\n\n\n\n\n\n\ntrain.setup_model_card(cfg)\nSet up the Axolotl badge and add the Axolotl config to the model card if available.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\ntrain.setup_reference_model(cfg, tokenizer)\nSet up the reference model for RL training if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer to use for the reference model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nPreTrainedModel | None\nReference model if needed for RL training, None otherwise.\n\n\n\n\n\n\n\ntrain.setup_signal_handler(cfg, model, safe_serialization)\nSet up signal handler for graceful termination.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to save on termination\nrequired\n\n\nsafe_serialization\nbool\nWhether to use safe serialization when saving\nrequired\n\n\n\n\n\n\n\ntrain.train(cfg, dataset_meta)\nTrain a model on the given dataset.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration dictionary with training 
parameters\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nObject with training, validation datasets and metadata\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PeftModel | PreTrainedModel, PreTrainedTokenizer, Trainer]\nTuple of (model, tokenizer) after training" + }, + { + "objectID": "docs/api/prompt_strategies.messages.chat.html", + "href": "docs/api/prompt_strategies.messages.chat.html", + "title": "prompt_strategies.messages.chat", + "section": "", + "text": "prompt_strategies.messages.chat\nChat dataset wrapping strategy for new internal messages representations\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatMessageDatasetWrappingStrategy\nChat dataset wrapping strategy for new internal messages representations\n\n\n\n\n\nprompt_strategies.messages.chat.ChatMessageDatasetWrappingStrategy(\n self,\n processor,\n message_transform=None,\n formatter=None,\n **kwargs,\n)\nChat dataset wrapping strategy for new internal messages representations" + }, + { + "objectID": "docs/api/prompt_strategies.messages.chat.html#classes", + "href": "docs/api/prompt_strategies.messages.chat.html#classes", + "title": "prompt_strategies.messages.chat", + "section": "", + "text": "Name\nDescription\n\n\n\n\nChatMessageDatasetWrappingStrategy\nChat dataset wrapping strategy for new internal messages representations\n\n\n\n\n\nprompt_strategies.messages.chat.ChatMessageDatasetWrappingStrategy(\n self,\n processor,\n message_transform=None,\n formatter=None,\n **kwargs,\n)\nChat dataset wrapping strategy for new internal messages representations" + }, + { + "objectID": "docs/api/prompt_strategies.orcamini.html", + "href": "docs/api/prompt_strategies.orcamini.html", + "title": "prompt_strategies.orcamini", + "section": "", + "text": "prompt_strategies.orcamini\nPrompt Strategy for finetuning Orca Mini (v2) models\nsee also https://huggingface.co/psmathur/orca_mini_v2_7b for more information\nUse dataset type: orcamini in conig.yml to use this prompt 
style.\nCompared to the alpaca_w_system.open_orca dataset type,\nthis one specifies the system prompt with “### System:”.\nNot suited/tested for multiple-turn conversations without further adjustments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nOrcaMiniPrompter\nAdjusted Prompter for Orca Mini (v2) datasets\n\n\n\n\n\nprompt_strategies.orcamini.OrcaMiniPrompter(\n self,\n prompt_style=PromptStyle.INSTRUCT.value,\n)\nAdjusted Prompter for Orca Mini (v2) datasets" + }, + { + "objectID": "docs/api/prompt_strategies.orcamini.html#classes", + "href": "docs/api/prompt_strategies.orcamini.html#classes", + "title": "prompt_strategies.orcamini", + "section": "", + "text": "Name\nDescription\n\n\n\n\nOrcaMiniPrompter\nAdjusted Prompter for Orca Mini (v2) datasets\n\n\n\n\n\nprompt_strategies.orcamini.OrcaMiniPrompter(\n self,\n prompt_style=PromptStyle.INSTRUCT.value,\n)\nAdjusted Prompter for Orca Mini (v2) datasets" + }, + { + "objectID": "docs/api/utils.collators.mm_chat.html", + "href": "docs/api/utils.collators.mm_chat.html", + "title": "utils.collators.mm_chat", + "section": "", + "text": "utils.collators.mm_chat\nCollators for multi-modal chat messages and packing\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultiModalChatDataCollator\nCollator for multi-modal chat messages\n\n\n\n\n\nutils.collators.mm_chat.MultiModalChatDataCollator(\n self,\n tokenizer,\n processor,\n return_tensors='pt',\n chat_template=None,\n packing=False,\n max_images=-1,\n padding=True,\n pad_to_multiple_of=None,\n)\nCollator for multi-modal chat messages" + }, + { + "objectID": "docs/api/utils.collators.mm_chat.html#classes", + "href": "docs/api/utils.collators.mm_chat.html#classes", + "title": "utils.collators.mm_chat", + "section": "", + "text": "Name\nDescription\n\n\n\n\nMultiModalChatDataCollator\nCollator for multi-modal chat messages\n\n\n\n\n\nutils.collators.mm_chat.MultiModalChatDataCollator(\n self,\n tokenizer,\n processor,\n return_tensors='pt',\n chat_template=None,\n packing=False,\n 
max_images=-1,\n padding=True,\n pad_to_multiple_of=None,\n)\nCollator for multi-modal chat messages" + }, + { + "objectID": "docs/api/prompt_strategies.kto.llama3.html", + "href": "docs/api/prompt_strategies.kto.llama3.html", + "title": "prompt_strategies.kto.llama3", + "section": "", + "text": "prompt_strategies.kto.llama3\nKTO strategies for llama-3 chat template\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.llama3.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto" + }, + { + "objectID": "docs/api/prompt_strategies.kto.llama3.html#functions", + "href": "docs/api/prompt_strategies.kto.llama3.html#functions", + "title": "prompt_strategies.kto.llama3", + "section": "", + "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.llama3.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto" + }, + { + "objectID": "docs/api/monkeypatch.attention.mllama.html", + "href": "docs/api/monkeypatch.attention.mllama.html", + "title": "monkeypatch.attention.mllama", + "section": "", + "text": "monkeypatch.attention.mllama\nMonkeypatch for Vision Llama for FA2 
support\n\n\n\n\n\nName\nDescription\n\n\n\n\nMllamaTextCrossFlashAttention2\nMllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and\n\n\nMllamaTextSelfFlashAttention2\nMllama flash self-attention module. This module inherits from MllamaTextSelfAttention and\n\n\n\n\n\nmonkeypatch.attention.mllama.MllamaTextCrossFlashAttention2(\n self,\n *args,\n **kwargs,\n)\nMllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and\nimplements the forward pass using Flash Attention for improved performance.\n\n\n\nmonkeypatch.attention.mllama.MllamaTextSelfFlashAttention2(\n self,\n config,\n layer_idx,\n *args,\n **kwargs,\n)\nMllama flash self-attention module. This module inherits from MllamaTextSelfAttention and\nimplements the forward pass using Flash Attention for improved performance." + }, + { + "objectID": "docs/api/monkeypatch.attention.mllama.html#classes", + "href": "docs/api/monkeypatch.attention.mllama.html#classes", + "title": "monkeypatch.attention.mllama", + "section": "", + "text": "Name\nDescription\n\n\n\n\nMllamaTextCrossFlashAttention2\nMllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and\n\n\nMllamaTextSelfFlashAttention2\nMllama flash self-attention module. This module inherits from MllamaTextSelfAttention and\n\n\n\n\n\nmonkeypatch.attention.mllama.MllamaTextCrossFlashAttention2(\n self,\n *args,\n **kwargs,\n)\nMllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and\nimplements the forward pass using Flash Attention for improved performance.\n\n\n\nmonkeypatch.attention.mllama.MllamaTextSelfFlashAttention2(\n self,\n config,\n layer_idx,\n *args,\n **kwargs,\n)\nMllama flash self-attention module. This module inherits from MllamaTextSelfAttention and\nimplements the forward pass using Flash Attention for improved performance." 
+ }, + { + "objectID": "docs/api/cli.checks.html", + "href": "docs/api/cli.checks.html", + "title": "cli.checks", + "section": "", + "text": "cli.checks\nVarious checks for Axolotl CLI.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_accelerate_default_config\nLogs at warning level if no accelerate config file is found.\n\n\ncheck_user_token\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\ncli.checks.check_accelerate_default_config()\nLogs at warning level if no accelerate config file is found.\n\n\n\ncli.checks.check_user_token()\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nBoolean indicating successful check (i.e., HF_HUB_OFFLINE=1 or HF user info is retrieved).\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLocalTokenNotFoundError\nIf HF user info can’t be retrieved." + }, + { + "objectID": "docs/api/cli.checks.html#functions", + "href": "docs/api/cli.checks.html#functions", + "title": "cli.checks", + "section": "", + "text": "Name\nDescription\n\n\n\n\ncheck_accelerate_default_config\nLogs at warning level if no accelerate config file is found.\n\n\ncheck_user_token\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\ncli.checks.check_accelerate_default_config()\nLogs at warning level if no accelerate config file is found.\n\n\n\ncli.checks.check_user_token()\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nBoolean indicating successful check (i.e., HF_HUB_OFFLINE=1 or HF user info is retrieved).\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLocalTokenNotFoundError\nIf HF user info can’t be retrieved." 
+ }, + { + "objectID": "docs/api/monkeypatch.transformers_fa_utils.html", + "href": "docs/api/monkeypatch.transformers_fa_utils.html", + "title": "monkeypatch.transformers_fa_utils", + "section": "", + "text": "monkeypatch.transformers_fa_utils\nsee https://github.com/huggingface/transformers/pull/35834\n\n\n\n\n\nName\nDescription\n\n\n\n\nfixed_fa_peft_integration_check\nPEFT usually casts the layer norms in float32 for training stability reasons\n\n\n\n\n\nmonkeypatch.transformers_fa_utils.fixed_fa_peft_integration_check(\n query,\n key,\n value,\n target_dtype=None,\n preferred_dtype=None,\n)\nPEFT usually casts the layer norms in float32 for training stability reasons\ntherefore the input hidden states gets silently casted in float32. Hence, we need\ncast them back in float16 / bfloat16 just to be sure everything works as expected.\nThis might slowdown training & inference so it is recommended to not cast the LayerNorms!\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nquery\ntorch.Tensor\nInput query states to be passed to Flash Attention API\nrequired\n\n\nkey\ntorch.Tensor\nInput key states to be passed to Flash Attention API\nrequired\n\n\nvalue\ntorch.Tensor\nInput value states to be passed to Flash Attention API\nrequired\n\n\ntarget_dtype\ntorch.dtype, optional\nThe dtype to convert the attention tensors to. 
Conversion can be ignored by not providing the target dtype.\nNone\n\n\npreferred_dtype\ntorch.dtype, optional\nThe preferred dtype to convert the attention tensors to regardless of the target dtype.\nNone" + }, + { + "objectID": "docs/api/monkeypatch.transformers_fa_utils.html#functions", + "href": "docs/api/monkeypatch.transformers_fa_utils.html#functions", + "title": "monkeypatch.transformers_fa_utils", + "section": "", + "text": "Name\nDescription\n\n\n\n\nfixed_fa_peft_integration_check\nPEFT usually casts the layer norms in float32 for training stability reasons\n\n\n\n\n\nmonkeypatch.transformers_fa_utils.fixed_fa_peft_integration_check(\n query,\n key,\n value,\n target_dtype=None,\n preferred_dtype=None,\n)\nPEFT usually casts the layer norms in float32 for training stability reasons\ntherefore the input hidden states gets silently casted in float32. Hence, we need\ncast them back in float16 / bfloat16 just to be sure everything works as expected.\nThis might slowdown training & inference so it is recommended to not cast the LayerNorms!\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nquery\ntorch.Tensor\nInput query states to be passed to Flash Attention API\nrequired\n\n\nkey\ntorch.Tensor\nInput key states to be passed to Flash Attention API\nrequired\n\n\nvalue\ntorch.Tensor\nInput value states to be passed to Flash Attention API\nrequired\n\n\ntarget_dtype\ntorch.dtype, optional\nThe dtype to convert the attention tensors to. 
Conversion can be ignored by not providing the target dtype.\nNone\n\n\npreferred_dtype\ntorch.dtype, optional\nThe preferred dtype to convert the attention tensors to regardless of the target dtype.\nNone" + }, + { + "objectID": "docs/api/monkeypatch.llama_attn_hijack_xformers.html", + "href": "docs/api/monkeypatch.llama_attn_hijack_xformers.html", + "title": "monkeypatch.llama_attn_hijack_xformers", + "section": "", + "text": "monkeypatch.llama_attn_hijack_xformers\nmonkeypatch.llama_attn_hijack_xformers\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments" + }, + { + "objectID": "docs/api/core.trainers.dpo.trainer.html", + "href": "docs/api/core.trainers.dpo.trainer.html", + "title": "core.trainers.dpo.trainer", + "section": "", + "text": "core.trainers.dpo.trainer\nDPO trainer for axolotl\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlDPOTrainer\nExtend the base DPOTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.dpo.trainer.AxolotlDPOTrainer(\n self,\n *args,\n dataset_tags=None,\n **kwargs,\n)\nExtend the base DPOTrainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\npush_to_hub\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\n\n\n\n\n\ncore.trainers.dpo.trainer.AxolotlDPOTrainer.push_to_hub(*args, **kwargs)\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\nmodel on the Hub. Please refer to ~transformers.Trainer.push_to_hub for more details." 
+ }, + { + "objectID": "docs/api/core.trainers.dpo.trainer.html#classes", + "href": "docs/api/core.trainers.dpo.trainer.html#classes", + "title": "core.trainers.dpo.trainer", + "section": "", + "text": "Name\nDescription\n\n\n\n\nAxolotlDPOTrainer\nExtend the base DPOTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.dpo.trainer.AxolotlDPOTrainer(\n self,\n *args,\n dataset_tags=None,\n **kwargs,\n)\nExtend the base DPOTrainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\npush_to_hub\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\n\n\n\n\n\ncore.trainers.dpo.trainer.AxolotlDPOTrainer.push_to_hub(*args, **kwargs)\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\nmodel on the Hub. Please refer to ~transformers.Trainer.push_to_hub for more details." + }, + { + "objectID": "docs/api/prompt_strategies.user_defined.html", + "href": "docs/api/prompt_strategies.user_defined.html", + "title": "prompt_strategies.user_defined", + "section": "", + "text": "prompt_strategies.user_defined\nUser Defined prompts with configuration from the YML config\n\n\n\n\n\nName\nDescription\n\n\n\n\nUserDefinedDatasetConfig\ndataclass configuration representing a userdefined dataset type\n\n\nUserDefinedPromptTokenizationStrategy\nPrompt Tokenization Strategy for user defined prompts\n\n\n\n\n\nprompt_strategies.user_defined.UserDefinedDatasetConfig(\n self,\n system_prompt='',\n field_system='system',\n field_instruction='instruction',\n field_input='input',\n field_output='output',\n format='{instruction} {input} ',\n no_input_format='{instruction} ',\n system_format='{system}',\n)\ndataclass configuration representing a userdefined dataset type\n\n\n\nprompt_strategies.user_defined.UserDefinedPromptTokenizationStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nPrompt Tokenization Strategy for user defined prompts" + }, + { + "objectID": 
"docs/api/prompt_strategies.user_defined.html#classes", + "href": "docs/api/prompt_strategies.user_defined.html#classes", + "title": "prompt_strategies.user_defined", + "section": "", + "text": "Name\nDescription\n\n\n\n\nUserDefinedDatasetConfig\ndataclass configuration representing a userdefined dataset type\n\n\nUserDefinedPromptTokenizationStrategy\nPrompt Tokenization Strategy for user defined prompts\n\n\n\n\n\nprompt_strategies.user_defined.UserDefinedDatasetConfig(\n self,\n system_prompt='',\n field_system='system',\n field_instruction='instruction',\n field_input='input',\n field_output='output',\n format='{instruction} {input} ',\n no_input_format='{instruction} ',\n system_format='{system}',\n)\ndataclass configuration representing a userdefined dataset type\n\n\n\nprompt_strategies.user_defined.UserDefinedPromptTokenizationStrategy(\n self,\n prompter,\n tokenizer,\n train_on_inputs=False,\n sequence_len=2048,\n)\nPrompt Tokenization Strategy for user defined prompts" + }, + { + "objectID": "docs/api/cli.args.html", + "href": "docs/api/cli.args.html", + "title": "cli.args", + "section": "", + "text": "cli.args\nModule for axolotl CLI command arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nEvaluateCliArgs\nDataclass with CLI arguments for axolotl evaluate command.\n\n\nInferenceCliArgs\nDataclass with CLI arguments for axolotl inference command.\n\n\nPreprocessCliArgs\nDataclass with CLI arguments for axolotl preprocess command.\n\n\nTrainerCliArgs\nDataclass with CLI arguments for axolotl train command.\n\n\n\n\n\ncli.args.EvaluateCliArgs(\n self,\n debug=False,\n debug_text_only=False,\n debug_num_examples=0,\n)\nDataclass with CLI arguments for axolotl evaluate command.\n\n\n\ncli.args.InferenceCliArgs(self, prompter=None)\nDataclass with CLI arguments for axolotl inference command.\n\n\n\ncli.args.PreprocessCliArgs(\n self,\n debug=False,\n debug_text_only=False,\n debug_num_examples=1,\n prompter=None,\n download=True,\n 
iterable=None,\n)\nDataclass with CLI arguments for axolotl preprocess command.\n\n\n\ncli.args.TrainerCliArgs(\n self,\n debug=False,\n debug_text_only=False,\n debug_num_examples=0,\n merge_lora=False,\n prompter=None,\n shard=False,\n main_process_port=None,\n num_processes=None,\n)\nDataclass with CLI arguments for axolotl train command." + }, + { + "objectID": "docs/api/cli.args.html#classes", + "href": "docs/api/cli.args.html#classes", + "title": "cli.args", + "section": "", + "text": "Name\nDescription\n\n\n\n\nEvaluateCliArgs\nDataclass with CLI arguments for axolotl evaluate command.\n\n\nInferenceCliArgs\nDataclass with CLI arguments for axolotl inference command.\n\n\nPreprocessCliArgs\nDataclass with CLI arguments for axolotl preprocess command.\n\n\nTrainerCliArgs\nDataclass with CLI arguments for axolotl train command.\n\n\n\n\n\ncli.args.EvaluateCliArgs(\n self,\n debug=False,\n debug_text_only=False,\n debug_num_examples=0,\n)\nDataclass with CLI arguments for axolotl evaluate command.\n\n\n\ncli.args.InferenceCliArgs(self, prompter=None)\nDataclass with CLI arguments for axolotl inference command.\n\n\n\ncli.args.PreprocessCliArgs(\n self,\n debug=False,\n debug_text_only=False,\n debug_num_examples=1,\n prompter=None,\n download=True,\n iterable=None,\n)\nDataclass with CLI arguments for axolotl preprocess command.\n\n\n\ncli.args.TrainerCliArgs(\n self,\n debug=False,\n debug_text_only=False,\n debug_num_examples=0,\n merge_lora=False,\n prompter=None,\n shard=False,\n main_process_port=None,\n num_processes=None,\n)\nDataclass with CLI arguments for axolotl train command." 
+ }, + { + "objectID": "docs/api/prompt_strategies.llama2_chat.html", + "href": "docs/api/prompt_strategies.llama2_chat.html", + "title": "prompt_strategies.llama2_chat", + "section": "", + "text": "prompt_strategies.llama2_chat\nPrompt Strategy for finetuning Llama2 chat models\nsee also https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/generation.py#L213 for ma reference implementation.\nThis implementation is based on the Vicuna PR and the fastchat repo, see also:\nhttps://github.com/lm-sys/FastChat/blob/cdd7730686cb1bf9ae2b768ee171bdf7d1ff04f3/fastchat/conversation.py#L847\nUse dataset type: “llama2_chat” in conig.yml to use this prompt style.\nE.g. in the config.yml:\ndatasets:\n - path: llama_finetune_train.jsonl\n type: llama2_chat\nThe dataset itself should look like this:\n{'conversations':[{\"from\": \"human\", \"value\": \"Who are you?\"}, {\"from\": \"gpt\", \"value\": \"I am Vicuna\"},...]}\nin a jsonl file. The first message should be from the human, the second from gpt.\nFor a custom system message, the first “from” can be “system” (followed by alternating “human” and “gpt” turns).\nImportant: Don’t use “special_tokens:” in your config.yml if you are not sure what you are doing!\n\n\n\n\n\nName\nDescription\n\n\n\n\nLLama2ChatTokenizingStrategy\nTokenizing strategy for Llama2 prompts.\n\n\nLlama2ChatConversation\nA class that manages prompt templates and keeps all conversation history.\n\n\nLlama2ChatPrompter\nA prompter that generates prompts for Llama2 models.\n\n\n\n\n\nprompt_strategies.llama2_chat.LLama2ChatTokenizingStrategy(\n self,\n *args,\n **kwargs,\n)\nTokenizing strategy for Llama2 prompts.\nadapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation(\n self,\n name='llama2',\n system=\"[INST] <<SYS>>\\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. 
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\\n<</SYS>>\\n\\n\",\n roles=('[INST]', '[/INST]'),\n messages=list(),\n offset=0,\n)\nA class that manages prompt templates and keeps all conversation history.\ncopied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py\n\n\n\n\n\nName\nDescription\n\n\n\n\nappend_message\nAppend a new message.\n\n\nget_prompt\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.append_message(\n role,\n message,\n)\nAppend a new message.\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.get_prompt()\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatPrompter()\nA prompter that generates prompts for Llama2 models." 
+ }, + { + "objectID": "docs/api/prompt_strategies.llama2_chat.html#classes", + "href": "docs/api/prompt_strategies.llama2_chat.html#classes", + "title": "prompt_strategies.llama2_chat", + "section": "", + "text": "Name\nDescription\n\n\n\n\nLLama2ChatTokenizingStrategy\nTokenizing strategy for Llama2 prompts.\n\n\nLlama2ChatConversation\nA class that manages prompt templates and keeps all conversation history.\n\n\nLlama2ChatPrompter\nA prompter that generates prompts for Llama2 models.\n\n\n\n\n\nprompt_strategies.llama2_chat.LLama2ChatTokenizingStrategy(\n self,\n *args,\n **kwargs,\n)\nTokenizing strategy for Llama2 prompts.\nadapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation(\n self,\n name='llama2',\n system=\"[INST] <<SYS>>\\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.\\n<</SYS>>\\n\\n\",\n roles=('[INST]', '[/INST]'),\n messages=list(),\n offset=0,\n)\nA class that manages prompt templates and keeps all conversation history.\ncopied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py\n\n\n\n\n\nName\nDescription\n\n\n\n\nappend_message\nAppend a new message.\n\n\nget_prompt\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.append_message(\n role,\n message,\n)\nAppend a new message.\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.get_prompt()\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatPrompter()\nA prompter that generates prompts for Llama2 models." + }, + { + "objectID": "docs/api/utils.schemas.config.html", + "href": "docs/api/utils.schemas.config.html", + "title": "utils.schemas.config", + "section": "", + "text": "utils.schemas.config\nModule with Pydantic models for configuration.\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlConfigWCapabilities\nwrapper to valdiate gpu capabilities with the configured options\n\n\nAxolotlInputConfig\nWrapper of all config options\n\n\n\n\n\nutils.schemas.config.AxolotlConfigWCapabilities()\nwrapper to valdiate gpu capabilities with the configured options\n\n\n\nutils.schemas.config.AxolotlInputConfig()\nWrapper of all config options" + }, + { + "objectID": "docs/api/utils.schemas.config.html#classes", + "href": "docs/api/utils.schemas.config.html#classes", + "title": "utils.schemas.config", + "section": "", + "text": "Name\nDescription\n\n\n\n\nAxolotlConfigWCapabilities\nwrapper to valdiate gpu capabilities with the configured options\n\n\nAxolotlInputConfig\nWrapper of all config options\n\n\n\n\n\nutils.schemas.config.AxolotlConfigWCapabilities()\nwrapper to valdiate gpu capabilities with the configured options\n\n\n\nutils.schemas.config.AxolotlInputConfig()\nWrapper of all config 
options" + }, + { + "objectID": "docs/api/core.trainers.grpo.trainer.html", + "href": "docs/api/core.trainers.grpo.trainer.html", + "title": "core.trainers.grpo.trainer", + "section": "", + "text": "core.trainers.grpo.trainer\nAxolotl GRPO trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlGRPOTrainer\nExtend the base GRPOTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOTrainer(self, *args, **kwargs)\nExtend the base GRPOTrainer for axolotl helpers" + }, + { + "objectID": "docs/api/core.trainers.grpo.trainer.html#classes", + "href": "docs/api/core.trainers.grpo.trainer.html#classes", + "title": "core.trainers.grpo.trainer", + "section": "", + "text": "Name\nDescription\n\n\n\n\nAxolotlGRPOTrainer\nExtend the base GRPOTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOTrainer(self, *args, **kwargs)\nExtend the base GRPOTrainer for axolotl helpers" + }, + { + "objectID": "docs/api/core.chat.format.chatml.html", + "href": "docs/api/core.chat.format.chatml.html", + "title": "core.chat.format.chatml", + "section": "", + "text": "core.chat.format.chatml\ncore.chat.format.chatml\nChatML transformation functions for MessageContents" + }, + { + "objectID": "docs/api/monkeypatch.lora_kernels.html", + "href": "docs/api/monkeypatch.lora_kernels.html", + "title": "monkeypatch.lora_kernels", + "section": "", + "text": "monkeypatch.lora_kernels\nModule for patching custom LoRA Triton kernels and torch.autograd functions.\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_lora_kernel_patches\nApplies optimized Triton kernel patches to a PEFT model.\n\n\nget_attention_cls_from_config\nGet the appropriate attention class by inspecting the model config.\n\n\noriginal_apply_o\nOriginal implementation of output projection without optimizations.\n\n\noriginal_apply_qkv\nOriginal implementation of QKV projection without optimizations.\n\n\npatch_self_attn_lora\nGiven an axolotl config, this method patches the inferred attention class 
forward\n\n\n\n\n\nmonkeypatch.lora_kernels.apply_lora_kernel_patches(model, cfg)\nApplies optimized Triton kernel patches to a PEFT model.\nPatches a PEFT model with optimized implementations for MLP and attention\ncomputations. The optimizations include custom Triton kernels for activation\nfunctions and specialized autograd functions for LoRA computations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nPeftModelForCausalLM\nA PEFT model to be patched with optimized kernels.\nrequired\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nPeftModelForCausalLM\nPeftModelForCausalLM\nThe patched model with optimized kernels.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTypeError\nIf the provided model is not a PeftModelForCausalLM.\n\n\n\nNotImplementedError\nIf the model type is not supported.\n\n\n\nAssertionError\nIf multiple adapters are active (currently unsupported).\n\n\n\n\n\n\nThe optimizations require LoRA adapters with no dropout and no bias terms. 
The\nfunction will skip patching if these conditions aren’t met.\n\n\n\n\nmonkeypatch.lora_kernels.get_attention_cls_from_config(cfg)\nGet the appropriate attention class by inspecting the model config.\nUses dynamic import to support any model architecture that follows\nthe standard transformers naming convention.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nType[nn.Module]\nThe appropriate attention class for the model.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf base_model not specified or attention class cannot be imported\n\n\n\nImportError\nIf the model module or attention class doesn’t exist\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_o(self, hidden_states)\nOriginal implementation of output projection without optimizations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nself\nnn.Module\nThe attention module instance.\nrequired\n\n\nhidden_states\ntorch.Tensor\nInput tensor of shape [batch_size, seq_len, hidden_dim]`.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nThe output projection result.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_qkv(self, hidden_states)\nOriginal implementation of QKV projection without optimizations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nself\nnn.Module\nThe attention module instance.\nrequired\n\n\nhidden_states\ntorch.Tensor\nInput tensor of shape [batch_size, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nA tuple (query_states, key_states, value_states) containing the projected states for query, key, and value.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.patch_self_attn_lora(cfg)\nGiven an axolotl config, this method patches the inferred attention 
class forward\npass with optimized LoRA implementations.\nIt modifies the attention class to use optimized QKV and output projections. The\noriginal implementation is preserved and can be restored if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAssertionError\nIf the required code blocks are not found in the attention implementation." + }, + { + "objectID": "docs/api/monkeypatch.lora_kernels.html#functions", + "href": "docs/api/monkeypatch.lora_kernels.html#functions", + "title": "monkeypatch.lora_kernels", + "section": "", + "text": "Name\nDescription\n\n\n\n\napply_lora_kernel_patches\nApplies optimized Triton kernel patches to a PEFT model.\n\n\nget_attention_cls_from_config\nGet the appropriate attention class by inspecting the model config.\n\n\noriginal_apply_o\nOriginal implementation of output projection without optimizations.\n\n\noriginal_apply_qkv\nOriginal implementation of QKV projection without optimizations.\n\n\npatch_self_attn_lora\nGiven an axolotl config, this method patches the inferred attention class forward\n\n\n\n\n\nmonkeypatch.lora_kernels.apply_lora_kernel_patches(model, cfg)\nApplies optimized Triton kernel patches to a PEFT model.\nPatches a PEFT model with optimized implementations for MLP and attention\ncomputations. 
The optimizations include custom Triton kernels for activation\nfunctions and specialized autograd functions for LoRA computations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nPeftModelForCausalLM\nA PEFT model to be patched with optimized kernels.\nrequired\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nPeftModelForCausalLM\nPeftModelForCausalLM\nThe patched model with optimized kernels.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTypeError\nIf the provided model is not a PeftModelForCausalLM.\n\n\n\nNotImplementedError\nIf the model type is not supported.\n\n\n\nAssertionError\nIf multiple adapters are active (currently unsupported).\n\n\n\n\n\n\nThe optimizations require LoRA adapters with no dropout and no bias terms. The\nfunction will skip patching if these conditions aren’t met.\n\n\n\n\nmonkeypatch.lora_kernels.get_attention_cls_from_config(cfg)\nGet the appropriate attention class by inspecting the model config.\nUses dynamic import to support any model architecture that follows\nthe standard transformers naming convention.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nType[nn.Module]\nThe appropriate attention class for the model.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf base_model not specified or attention class cannot be imported\n\n\n\nImportError\nIf the model module or attention class doesn’t exist\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_o(self, hidden_states)\nOriginal implementation of output projection without optimizations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nself\nnn.Module\nThe attention module instance.\nrequired\n\n\nhidden_states\ntorch.Tensor\nInput tensor 
of shape [batch_size, seq_len, hidden_dim]`.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nThe output projection result.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_qkv(self, hidden_states)\nOriginal implementation of QKV projection without optimizations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nself\nnn.Module\nThe attention module instance.\nrequired\n\n\nhidden_states\ntorch.Tensor\nInput tensor of shape [batch_size, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nA tuple (query_states, key_states, value_states) containing the projected states for query, key, and value.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.patch_self_attn_lora(cfg)\nGiven an axolotl config, this method patches the inferred attention class forward\npass with optimized LoRA implementations.\nIt modifies the attention class to use optimized QKV and output projections. The\noriginal implementation is preserved and can be restored if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAssertionError\nIf the required code blocks are not found in the attention implementation." + }, + { + "objectID": "docs/api/prompt_strategies.base.html", + "href": "docs/api/prompt_strategies.base.html", + "title": "prompt_strategies.base", + "section": "", + "text": "prompt_strategies.base\nprompt_strategies.base\nmodule for base dataset transform strategies" + }, { "objectID": "docs/rlhf.html", "href": "docs/rlhf.html", "title": "RLHF (Beta)", "section": "", - "text": "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback. 
Various methods include, but not limited to:\n\nDirect Preference Optimization (DPO)\nIdentity Preference Optimization (IPO)\nKahneman-Tversky Optimization (KTO)\nOdds Ratio Preference Optimization (ORPO)\nProximal Policy Optimization (PPO) (not yet supported in axolotl)", + "text": "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human\nfeedback. Various methods include, but not limited to:\n\nDirect Preference Optimization (DPO)\nIdentity Preference Optimization (IPO)\nKahneman-Tversky Optimization (KTO)\nOdds Ratio Preference Optimization (ORPO)\nProximal Policy Optimization (PPO) (not yet supported in axolotl)", "crumbs": [ "How To Guides", "RLHF (Beta)" @@ -867,7 +2687,7 @@ "href": "docs/rlhf.html#overview", "title": "RLHF (Beta)", "section": "", - "text": "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback. Various methods include, but not limited to:\n\nDirect Preference Optimization (DPO)\nIdentity Preference Optimization (IPO)\nKahneman-Tversky Optimization (KTO)\nOdds Ratio Preference Optimization (ORPO)\nProximal Policy Optimization (PPO) (not yet supported in axolotl)", + "text": "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human\nfeedback. Various methods include, but not limited to:\n\nDirect Preference Optimization (DPO)\nIdentity Preference Optimization (IPO)\nKahneman-Tversky Optimization (KTO)\nOdds Ratio Preference Optimization (ORPO)\nProximal Policy Optimization (PPO) (not yet supported in axolotl)", "crumbs": [ "How To Guides", "RLHF (Beta)" @@ -887,56 +2707,56 @@ { "objectID": "docs/cli.html", "href": "docs/cli.html", - "title": "CLI Reference", + "title": "Command Line Interface (CLI)", "section": "", - "text": "The Axolotl CLI provides a streamlined interface for training and fine-tuning large language models. 
This guide covers the CLI commands, their usage, and common examples.", + "text": "The Axolotl CLI provides a streamlined interface for training and fine-tuning large language models. This guide covers\nthe CLI commands, their usage, and common examples.", "crumbs": [ "Getting Started", - "CLI Reference" + "Command Line Interface (CLI)" ] }, { "objectID": "docs/cli.html#basic-commands", "href": "docs/cli.html#basic-commands", - "title": "CLI Reference", + "title": "Command Line Interface (CLI)", "section": "Basic Commands", "text": "Basic Commands\nAll Axolotl commands follow this general structure:\naxolotl <command> [config.yml] [options]\nThe config file can be local or a URL to a raw YAML file.", "crumbs": [ "Getting Started", - "CLI Reference" + "Command Line Interface (CLI)" ] }, { "objectID": "docs/cli.html#command-reference", "href": "docs/cli.html#command-reference", - "title": "CLI Reference", + "title": "Command Line Interface (CLI)", "section": "Command Reference", "text": "Command Reference\n\nfetch\nDownloads example configurations and deepspeed configs to your local machine.\n# Get example YAML files\naxolotl fetch examples\n\n# Get deepspeed config files\naxolotl fetch deepspeed_configs\n\n# Specify custom destination\naxolotl fetch examples --dest path/to/folder\n\n\npreprocess\nPreprocesses and tokenizes your dataset before training. 
This is recommended for large datasets.\n# Basic preprocessing\naxolotl preprocess config.yml\n\n# Preprocessing with one GPU\nCUDA_VISIBLE_DEVICES=\"0\" axolotl preprocess config.yml\n\n# Debug mode to see processed examples\naxolotl preprocess config.yml --debug\n\n# Debug with limited examples\naxolotl preprocess config.yml --debug --debug-num-examples 5\nConfiguration options:\ndataset_prepared_path: Local folder for saving preprocessed data\npush_dataset_to_hub: HuggingFace repo to push preprocessed data (optional)\n\n\ntrain\nTrains or fine-tunes a model using the configuration specified in your YAML file.\n# Basic training\naxolotl train config.yml\n\n# Train and set/override specific options\naxolotl train config.yml \\\n --learning-rate 1e-4 \\\n --micro-batch-size 2 \\\n --num-epochs 3\n\n# Training without accelerate\naxolotl train config.yml --no-accelerate\n\n# Resume training from checkpoint\naxolotl train config.yml --resume-from-checkpoint path/to/checkpoint\nIt is possible to run sweeps over multiple hyperparameters by passing in a sweeps config.\n# Basic training with sweeps\naxolotl train config.yml --sweep path/to/sweep.yaml\nExample sweep config:\n_:\n # This section is for dependent variables we need to fix\n - load_in_8bit: false\n load_in_4bit: false\n adapter: lora\n - load_in_8bit: true\n load_in_4bit: false\n adapter: lora\n\n# These are independent variables\nlearning_rate: [0.0003, 0.0006]\nlora_r:\n - 16\n - 32\nlora_alpha:\n - 16\n - 32\n - 64\n\n\ninference\nRuns inference using your trained model in either CLI or Gradio interface mode.\n# CLI inference with LoRA\naxolotl inference config.yml --lora-model-dir=\"./outputs/lora-out\"\n\n# CLI inference with full model\naxolotl inference config.yml --base-model=\"./completed-model\"\n\n# Gradio web interface\naxolotl inference config.yml --gradio \\\n --lora-model-dir=\"./outputs/lora-out\"\n\n# Inference with input from file\ncat prompt.txt | axolotl inference config.yml \\\n 
--base-model=\"./completed-model\"\n\n\nmerge-lora\nMerges trained LoRA adapters into the base model.\n# Basic merge\naxolotl merge-lora config.yml\n\n# Specify LoRA directory (usually used with checkpoints)\naxolotl merge-lora config.yml --lora-model-dir=\"./lora-output/checkpoint-100\"\n\n# Merge using CPU (if out of GPU memory)\nCUDA_VISIBLE_DEVICES=\"\" axolotl merge-lora config.yml\nConfiguration options:\ngpu_memory_limit: Limit GPU memory usage\nlora_on_cpu: Load LoRA weights on CPU\n\n\nmerge-sharded-fsdp-weights\nMerges sharded FSDP model checkpoints into a single combined checkpoint.\n# Basic merge\naxolotl merge-sharded-fsdp-weights config.yml\n\n\nevaluate\nEvaluates a model’s performance using metrics specified in the config.\n# Basic evaluation\naxolotl evaluate config.yml\n\n\nlm-eval\nRuns LM Evaluation Harness on your model.\n# Basic evaluation\naxolotl lm-eval config.yml\nConfiguration options:\n# List of tasks to evaluate\nlm_eval_tasks:\n - arc_challenge\n - hellaswag\nlm_eval_batch_size: # Batch size for evaluation\noutput_dir: # Directory to save evaluation results", "crumbs": [ "Getting Started", - "CLI Reference" + "Command Line Interface (CLI)" ] }, { "objectID": "docs/cli.html#legacy-cli-usage", "href": "docs/cli.html#legacy-cli-usage", - "title": "CLI Reference", + "title": "Command Line Interface (CLI)", "section": "Legacy CLI Usage", "text": "Legacy CLI Usage\nWhile the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:\n# Preprocess\npython -m axolotl.cli.preprocess config.yml\n\n# Train\naccelerate launch -m axolotl.cli.train config.yml\n\n# Inference\naccelerate launch -m axolotl.cli.inference config.yml \\\n --lora_model_dir=\"./outputs/lora-out\"\n\n# Gradio interface\naccelerate launch -m axolotl.cli.inference config.yml \\\n --lora_model_dir=\"./outputs/lora-out\" --gradio\n\n\n\n\n\n\nImportant\n\n\n\nWhen overriding CLI parameters in the legacy CLI, use same notation as in yaml file (e.g., 
--lora_model_dir).\nNote: This differs from the new Click-based CLI, which uses dash notation (e.g., --lora-model-dir). Keep this in mind if you’re referencing newer documentation or switching between CLI versions.", "crumbs": [ "Getting Started", - "CLI Reference" + "Command Line Interface (CLI)" ] }, { "objectID": "docs/cli.html#remote-compute-with-modal-cloud", "href": "docs/cli.html#remote-compute-with-modal-cloud", - "title": "CLI Reference", + "title": "Command Line Interface (CLI)", "section": "Remote Compute with Modal Cloud", - "text": "Remote Compute with Modal Cloud\nAxolotl supports running training and inference workloads on Modal cloud infrastructure. This is configured using a cloud YAML file alongside your regular Axolotl config.\n\nCloud Configuration\nCreate a cloud config YAML with your Modal settings:\n# cloud_config.yml\nprovider: modal\ngpu: a100 # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4\ngpu_count: 1 # Number of GPUs to use\ntimeout: 86400 # Maximum runtime in seconds (24 hours)\nbranch: main # Git branch to use (optional)\n\nvolumes: # Persistent storage volumes\n - name: axolotl-cache\n mount: /workspace/cache\n - name: axolotl-data\n mount: /workspace/data\n - name: axolotl-artifacts\n mount: /workspace/artifacts\n\nenv: # Environment variables\n - WANDB_API_KEY\n - HF_TOKEN\n\n\nRunning on Modal Cloud\nCommands that support the –cloud flag:\n# Preprocess on cloud\naxolotl preprocess config.yml --cloud cloud_config.yml\n\n# Train on cloud\naxolotl train config.yml --cloud cloud_config.yml\n\n# Train without accelerate on cloud\naxolotl train config.yml --cloud cloud_config.yml --no-accelerate\n\n# Run lm-eval on cloud\naxolotl lm-eval config.yml --cloud cloud_config.yml\n\n\nCloud Configuration Options\nprovider: # compute provider, currently only `modal` is supported\ngpu: # GPU type to use\ngpu_count: # Number of GPUs (default: 1)\nmemory: # RAM in GB (default: 128)\ntimeout: # Maximum runtime in 
seconds\ntimeout_preprocess: # Preprocessing timeout\nbranch: # Git branch to use\ndocker_tag: # Custom Docker image tag\nvolumes: # List of persistent storage volumes\nenv: # Environment variables to pass\nsecrets: # Secrets to inject", + "text": "Remote Compute with Modal Cloud\nAxolotl supports running training and inference workloads on Modal cloud infrastructure. This is configured using a\ncloud YAML file alongside your regular Axolotl config.\n\nCloud Configuration\nCreate a cloud config YAML with your Modal settings:\n# cloud_config.yml\nprovider: modal\ngpu: a100 # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4\ngpu_count: 1 # Number of GPUs to use\ntimeout: 86400 # Maximum runtime in seconds (24 hours)\nbranch: main # Git branch to use (optional)\n\nvolumes: # Persistent storage volumes\n - name: axolotl-cache\n mount: /workspace/cache\n - name: axolotl-data\n mount: /workspace/data\n - name: axolotl-artifacts\n mount: /workspace/artifacts\n\nenv: # Environment variables\n - WANDB_API_KEY\n - HF_TOKEN\n\n\nRunning on Modal Cloud\nCommands that support the –cloud flag:\n# Preprocess on cloud\naxolotl preprocess config.yml --cloud cloud_config.yml\n\n# Train on cloud\naxolotl train config.yml --cloud cloud_config.yml\n\n# Train without accelerate on cloud\naxolotl train config.yml --cloud cloud_config.yml --no-accelerate\n\n# Run lm-eval on cloud\naxolotl lm-eval config.yml --cloud cloud_config.yml\n\n\nCloud Configuration Options\nprovider: # compute provider, currently only `modal` is supported\ngpu: # GPU type to use\ngpu_count: # Number of GPUs (default: 1)\nmemory: # RAM in GB (default: 128)\ntimeout: # Maximum runtime in seconds\ntimeout_preprocess: # Preprocessing timeout\nbranch: # Git branch to use\ndocker_tag: # Custom Docker image tag\nvolumes: # List of persistent storage volumes\nenv: # Environment variables to pass\nsecrets: # Secrets to inject", "crumbs": [ "Getting Started", - "CLI Reference" + "Command Line Interface (CLI)" ] }, 
{ @@ -944,7 +2764,7 @@ "href": "docs/unsloth.html", "title": "Unsloth", "section": "", - "text": "Overview\nUnsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over standard industry baselines.\n\n\n\n\n\n\nImportant\n\n\n\nDue to breaking changes in transformers v4.48.0, users will need to downgrade to <=v4.47.1 to use this patch.\nThis will later be deprecated in favor of LoRA Optimizations.\n\n\n\n\nInstallation\nThe following will install the correct unsloth and extras from source.\npython scripts/unsloth_install.py | sh\n\n\nUsage\nAxolotl exposes a few configuration options to try out unsloth and get most of the performance gains.\nOur unsloth integration is currently limited to the following model architectures: - llama\nThese options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning\nunsloth_lora_mlp: true\nunsloth_lora_qkv: true\nunsloth_lora_o: true\nThese options are composable and can be used with multi-gpu finetuning\nunsloth_cross_entropy_loss: true\nunsloth_rms_norm: true\nunsloth_rope: true\n\n\nLimitations\n\nSingle GPU only; e.g. no multi-gpu support\nNo deepspeed or FSDP support (requires multi-gpu)\nLoRA + QLoRA support only. No full fine tunes or fp8 support.\nLimited model architecture support. 
Llama, Phi, Gemma, Mistral only\nNo MoE support.", + "text": "Overview\nUnsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over\nstandard industry baselines.\n\n\n\n\n\n\nImportant\n\n\n\nDue to breaking changes in transformers v4.48.0, users will need to downgrade to <=v4.47.1 to use this patch.\nThis will later be deprecated in favor of LoRA Optimizations.\n\n\n\n\nInstallation\nThe following will install the correct unsloth and extras from source.\npython scripts/unsloth_install.py | sh\n\n\nUsage\nAxolotl exposes a few configuration options to try out unsloth and get most of the performance gains.\nOur unsloth integration is currently limited to the following model architectures:\n- llama\nThese options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning\nunsloth_lora_mlp: true\nunsloth_lora_qkv: true\nunsloth_lora_o: true\nThese options are composable and can be used with multi-gpu finetuning\nunsloth_cross_entropy_loss: true\nunsloth_rms_norm: true\nunsloth_rope: true\n\n\nLimitations\n\nSingle GPU only; e.g. no multi-gpu support\nNo deepspeed or FSDP support (requires multi-gpu)\nLoRA + QLoRA support only. No full fine tunes or fp8 support.\nLimited model architecture support. 
Llama, Phi, Gemma, Mistral only\nNo MoE support.", "crumbs": [ "Advanced Features", "Unsloth" @@ -977,7 +2797,7 @@ "href": "docs/fsdp_qlora.html#usage", "title": "FDSP + QLoRA", "section": "Usage", - "text": "Usage\nTo enable QLoRA with FSDP, you need to perform the following steps:\n\n![Tip] See the example config file in addition to reading these instructions.\n\n\nSet adapter: qlora in your axolotl config file.\nEnable FSDP in your axolotl config, as described here.\nUse one of the supported model types: llama, mistral or mixtral.", + "text": "Usage\nTo enable QLoRA with FSDP, you need to perform the following steps:\n\n![Tip]\nSee the example config file in addition to reading these instructions.\n\n\nSet adapter: qlora in your axolotl config file.\nEnable FSDP in your axolotl config, as described here.\nUse one of the supported model types: llama, mistral or mixtral.", "crumbs": [ "Advanced Features", "FDSP + QLoRA" @@ -1021,7 +2841,7 @@ "href": "docs/dataset_preprocessing.html", "title": "Dataset Preprocessing", "section": "", - "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside the dataset format and prompt strategies to:\n\nparse the dataset based on the dataset format\ntransform the dataset to how you would interact with the model based on the prompt strategy\ntokenize the dataset based on the configured model & tokenizer\nshuffle and merge multiple datasets together if using more than one\n\nThe processing of the datasets can happen one of two ways:\n\nBefore kicking off training by calling axolotl preprocess config.yaml --debug\nWhen training is started\n\n\n\nWhen training interactively or for sweeps (e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly slow. 
Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent training parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example YAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a default path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly setting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed data is in the cache.\n\n\n\nLet’s say you are writing a custom prompt strategy or using a user-defined prompt template. Because the trainer cannot readily detect these changes, we cannot change the calculated hash value for the pre-processed dataset.\nIf you have dataset_prepared_path: ... set and change your prompt templating logic, it may not pick up the changes you made and you will be training over the old prompt.", + "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside\nthe dataset format and prompt strategies to:\n\nparse the dataset based on the dataset format\ntransform the dataset to how you would interact with the model based on the prompt strategy\ntokenize the dataset based on the configured model & tokenizer\nshuffle and merge multiple datasets together if using more than one\n\nThe processing of the datasets can happen one of two ways:\n\nBefore kicking off training by calling axolotl preprocess config.yaml --debug\nWhen training is started\n\n\n\nWhen training interactively or for sweeps\n(e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly\nslow. 
Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent\ntraining parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example\nYAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a\ndefault path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly\nsetting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed\ndata is in the cache.\n\n\n\nLet’s say you are writing a custom prompt strategy or using a user-defined\nprompt template. Because the trainer cannot readily detect these changes, we cannot change the\ncalculated hash value for the pre-processed dataset.\nIf you have dataset_prepared_path: ... set\nand change your prompt templating logic, it may not pick up the changes you made and you will be\ntraining over the old prompt.", "crumbs": [ "Core Concepts", "Dataset Preprocessing" @@ -1032,7 +2852,7 @@ "href": "docs/dataset_preprocessing.html#overview", "title": "Dataset Preprocessing", "section": "", - "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside the dataset format and prompt strategies to:\n\nparse the dataset based on the dataset format\ntransform the dataset to how you would interact with the model based on the prompt strategy\ntokenize the dataset based on the configured model & tokenizer\nshuffle and merge multiple datasets together if using more than one\n\nThe processing of the datasets can happen one of two ways:\n\nBefore kicking off training by calling axolotl preprocess config.yaml --debug\nWhen training is started\n\n\n\nWhen training interactively or for sweeps (e.g. 
you are restarting the trainer often), processing the datasets can oftentimes be frustratingly slow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent training parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example YAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a default path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly setting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed data is in the cache.\n\n\n\nLet’s say you are writing a custom prompt strategy or using a user-defined prompt template. Because the trainer cannot readily detect these changes, we cannot change the calculated hash value for the pre-processed dataset.\nIf you have dataset_prepared_path: ... set and change your prompt templating logic, it may not pick up the changes you made and you will be training over the old prompt.", + "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside\nthe dataset format and prompt strategies to:\n\nparse the dataset based on the dataset format\ntransform the dataset to how you would interact with the model based on the prompt strategy\ntokenize the dataset based on the configured model & tokenizer\nshuffle and merge multiple datasets together if using more than one\n\nThe processing of the datasets can happen one of two ways:\n\nBefore kicking off training by calling axolotl preprocess config.yaml --debug\nWhen training is started\n\n\n\nWhen training interactively or for sweeps\n(e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly\nslow. 
Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent\ntraining parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example\nYAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a\ndefault path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly\nsetting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed\ndata is in the cache.\n\n\n\nLet’s say you are writing a custom prompt strategy or using a user-defined\nprompt template. Because the trainer cannot readily detect these changes, we cannot change the\ncalculated hash value for the pre-processed dataset.\nIf you have dataset_prepared_path: ... set\nand change your prompt templating logic, it may not pick up the changes you made and you will be\ntraining over the old prompt.", "crumbs": [ "Core Concepts", "Dataset Preprocessing" @@ -1109,7 +2929,7 @@ "href": "docs/custom_integrations.html#spectrum", "title": "Custom Integrations", "section": "Spectrum", - "text": "Spectrum\nby Eric Hartford, Lucas Atkins, Fernando Fernandes, David Golchinfar\nThis plugin contains code to freeze the bottom fraction of modules in a model, based on the Signal-to-Noise Ratio (SNR).\nSee https://github.com/cognitivecomputations/spectrum\n\nOverview\nSpectrum is a tool for scanning and evaluating the Signal-to-Noise Ratio (SNR) of layers in large language models. 
By identifying the top n% of layers with the highest SNR, you can optimize training efficiency.\n\n\nUsage\nplugins:\n - axolotl.integrations.spectrum.SpectrumPlugin\n\nspectrum_top_fraction: 0.5\nspectrum_model_name: meta-llama/Meta-Llama-3.1-8B\n\n\nCitation\n@misc{hartford2024spectrumtargetedtrainingsignal,\n title={Spectrum: Targeted Training on Signal to Noise Ratio},\n author={Eric Hartford and Lucas Atkins and Fernando Fernandes Neto and David Golchinfar},\n year={2024},\n eprint={2406.06623},\n archivePrefix={arXiv},\n primaryClass={cs.LG},\n url={https://arxiv.org/abs/2406.06623},\n}\nPlease see reference here", + "text": "Spectrum\nby Eric Hartford, Lucas Atkins, Fernando Fernandes, David Golchinfar\nThis plugin contains code to freeze the bottom fraction of modules in a model, based on the Signal-to-Noise Ratio (SNR).\nSee https://github.com/cognitivecomputations/spectrum\n\nOverview\nSpectrum is a tool for scanning and evaluating the Signal-to-Noise Ratio (SNR) of layers in large language models.\nBy identifying the top n% of layers with the highest SNR, you can optimize training efficiency.\n\n\nUsage\nplugins:\n - axolotl.integrations.spectrum.SpectrumPlugin\n\nspectrum_top_fraction: 0.5\nspectrum_model_name: meta-llama/Meta-Llama-3.1-8B\n\n\nCitation\n@misc{hartford2024spectrumtargetedtrainingsignal,\n title={Spectrum: Targeted Training on Signal to Noise Ratio},\n author={Eric Hartford and Lucas Atkins and Fernando Fernandes Neto and David Golchinfar},\n year={2024},\n eprint={2406.06623},\n archivePrefix={arXiv},\n primaryClass={cs.LG},\n url={https://arxiv.org/abs/2406.06623},\n}\nPlease see reference here", "crumbs": [ "Advanced Features", "Custom Integrations" diff --git a/sitemap.xml b/sitemap.xml index c861e03fb..aec01cd11 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,162 +2,670 @@ https://axolotl-ai-cloud.github.io/axolotl/examples/colab-notebooks/colab-axolotl-example.html - 2025-03-21T15:02:57.839Z + 2025-03-21T17:28:24.757Z 
https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/stepwise_supervised.html - 2025-03-21T15:02:57.834Z + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/template_free.html - 2025-03-21T15:02:57.834Z + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/tokenized.html - 2025-03-21T15:02:57.834Z + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/nccl.html - 2025-03-21T15:02:57.838Z + 2025-03-21T17:28:24.755Z https://axolotl-ai-cloud.github.io/axolotl/docs/amd_hpc.html - 2025-03-21T15:02:57.834Z + 2025-03-21T17:28:24.751Z https://axolotl-ai-cloud.github.io/axolotl/docs/config.html - 2025-03-21T15:02:57.834Z + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/multi-gpu.html - 2025-03-21T15:02:57.838Z + 2025-03-21T17:28:24.755Z https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html - 2025-03-21T15:02:57.837Z + 2025-03-21T17:28:24.755Z https://axolotl-ai-cloud.github.io/axolotl/docs/torchao.html - 2025-03-21T15:02:57.838Z + 2025-03-21T17:28:24.756Z https://axolotl-ai-cloud.github.io/axolotl/docs/reward_modelling.html - 2025-03-21T15:02:57.838Z + 2025-03-21T17:28:24.756Z https://axolotl-ai-cloud.github.io/axolotl/docs/input_output.html - 2025-03-21T15:02:57.837Z + 2025-03-21T17:28:24.755Z https://axolotl-ai-cloud.github.io/axolotl/docs/multimodal.html - 2025-03-21T15:02:57.838Z + 2025-03-21T17:28:24.755Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.callbacks.mlflow_.html + 2025-03-21T17:28:59.673Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.trainer_fsdp_optim.html + 2025-03-21T17:28:59.268Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.data.batch_dataset_fetcher.html + 2025-03-21T17:28:59.284Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.stepwise_supervised.html + 2025-03-21T17:28:58.971Z + + + 
https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.mistral_attn_hijack_flash.html + 2025-03-21T17:28:59.216Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.dpo.user_defined.html + 2025-03-21T17:28:59.018Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/integrations.liger.args.html + 2025-03-21T17:28:59.589Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.schemas.training.html + 2025-03-21T17:28:59.455Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/datasets.html + 2025-03-21T17:28:58.475Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/kernels.geglu.html + 2025-03-21T17:28:59.155Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.llama_attn_hijack_flash.html + 2025-03-21T17:28:59.200Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.sweeps.html + 2025-03-21T17:28:58.808Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.freeze.html + 2025-03-21T17:28:59.358Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.multipack.html + 2025-03-21T17:28:59.217Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.main.html + 2025-03-21T17:28:58.705Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/core.trainers.trl.html + 2025-03-21T17:28:58.879Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.dpo.passthrough.html + 2025-03-21T17:28:59.020Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/core.chat.format.llama3x.html + 2025-03-21T17:28:58.659Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/core.datasets.transforms.chat_builder.html + 2025-03-21T17:28:58.673Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.kto.user_defined.html + 2025-03-21T17:28:59.038Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.collators.mamba.html + 2025-03-21T17:28:59.647Z + + + 
https://axolotl-ai-cloud.github.io/axolotl/docs/api/integrations.base.html + 2025-03-21T17:28:59.574Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.bench.html + 2025-03-21T17:28:59.350Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/kernels.swiglu.html + 2025-03-21T17:28:59.165Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/core.chat.format.shared.html + 2025-03-21T17:28:58.660Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/integrations.cut_cross_entropy.args.html + 2025-03-21T17:28:59.577Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/core.datasets.chat.html + 2025-03-21T17:28:58.665Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.callbacks.lisa.html + 2025-03-21T17:28:59.669Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/integrations.grokfast.optimizer.html + 2025-03-21T17:28:59.578Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.alpaca_chat.html + 2025-03-21T17:28:58.920Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.alpaca_instruct.html + 2025-03-21T17:28:58.922Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.kto.chatml.html + 2025-03-21T17:28:59.036Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.schemas.integrations.html + 2025-03-21T17:28:59.497Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.schemas.trl.html + 2025-03-21T17:28:59.485Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_tokenizers.html + 2025-03-21T17:28:58.531Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.data.sft.html + 2025-03-21T17:28:59.432Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.schedulers.html + 2025-03-21T17:28:59.399Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.chat_templates.html + 2025-03-21T17:28:59.332Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.models.html + 
2025-03-21T17:28:59.315Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.dpo.chatml.html + 2025-03-21T17:28:59.015Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.distributed.html + 2025-03-21T17:28:59.418Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.utils.html + 2025-03-21T17:28:59.256Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.schemas.utils.html + 2025-03-21T17:28:59.509Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.llama_expand_mask.html + 2025-03-21T17:28:59.226Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/common.datasets.html + 2025-03-21T17:28:59.615Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/logging_config.html + 2025-03-21T17:28:58.536Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/kernels.quantize.html + 2025-03-21T17:28:59.173Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.schemas.model.html + 2025-03-21T17:28:59.450Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.stablelm_attn_hijack_flash.html + 2025-03-21T17:28:59.265Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.mixtral.html + 2025-03-21T17:28:59.286Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.tokenization.html + 2025-03-21T17:28:59.322Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/integrations.kd.trainer.html + 2025-03-21T17:28:59.586Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.schemas.datasets.html + 2025-03-21T17:28:59.473Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.collators.core.html + 2025-03-21T17:28:59.617Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.btlm_attn_hijack_flash.html + 2025-03-21T17:28:59.258Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.optimizers.adopt.html + 2025-03-21T17:28:59.429Z + + + 
https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.input_output.html + 2025-03-21T17:28:58.967Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/index.html + 2025-03-21T17:28:58.397Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.cloud.modal_.html + 2025-03-21T17:28:58.849Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.dpo.llama3.html + 2025-03-21T17:28:59.005Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.train.html + 2025-03-21T17:28:58.714Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/core.trainer_builder.html + 2025-03-21T17:28:58.552Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.callbacks.perplexity.html + 2025-03-21T17:28:59.664Z https://axolotl-ai-cloud.github.io/axolotl/docs/getting-started.html - 2025-03-21T15:02:57.835Z + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/inference.html - 2025-03-21T15:02:57.837Z + 2025-03-21T17:28:24.755Z https://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html - 2025-03-21T15:02:57.838Z + 2025-03-21T17:28:24.755Z - https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html - 2025-03-21T15:02:57.835Z - - - https://axolotl-ai-cloud.github.io/axolotl/docs/lr_groups.html - 2025-03-21T15:02:57.837Z - - - https://axolotl-ai-cloud.github.io/axolotl/TODO.html - 2025-03-21T15:02:57.833Z - - - https://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/LICENSE.html - 2025-03-21T15:02:57.852Z - - - https://axolotl-ai-cloud.github.io/axolotl/index.html - 2025-03-21T15:02:57.849Z - - - https://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html - 2025-03-21T15:02:57.853Z - - - https://axolotl-ai-cloud.github.io/axolotl/FAQS.html - 2025-03-21T15:02:57.833Z + https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html + 2025-03-21T17:28:24.756Z https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html - 
2025-03-21T15:02:57.838Z + 2025-03-21T17:28:24.755Z + + + https://axolotl-ai-cloud.github.io/axolotl/FAQS.html + 2025-03-21T17:28:24.750Z + + + https://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html + 2025-03-21T17:28:24.772Z + + + https://axolotl-ai-cloud.github.io/axolotl/index.html + 2025-03-21T17:28:24.768Z + + + https://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/LICENSE.html + 2025-03-21T17:28:24.771Z + + + https://axolotl-ai-cloud.github.io/axolotl/TODO.html + 2025-03-21T17:28:24.750Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/lr_groups.html + 2025-03-21T17:28:24.755Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html - 2025-03-21T15:02:57.835Z + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/batch_vs_grad.html - 2025-03-21T15:02:57.834Z + 2025-03-21T17:28:24.751Z https://axolotl-ai-cloud.github.io/axolotl/docs/lora_optims.html - 2025-03-21T15:02:57.837Z + 2025-03-21T17:28:24.755Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.lora_embeddings.html + 2025-03-21T17:28:59.341Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/kernels.utils.html + 2025-03-21T17:28:59.174Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.chat_template.html + 2025-03-21T17:28:58.906Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/convert.html + 2025-03-21T17:28:58.489Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/common.const.html + 2025-03-21T17:28:59.598Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.cloud.base.html + 2025-03-21T17:28:58.843Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.relora.html + 2025-03-21T17:28:59.224Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.lora.html + 2025-03-21T17:28:59.337Z + + + 
https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.merge_lora.html + 2025-03-21T17:28:58.782Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.bradley_terry.llama3.html + 2025-03-21T17:28:59.062Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.merge_sharded_fsdp_weights.html + 2025-03-21T17:28:58.794Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/integrations.spectrum.args.html + 2025-03-21T17:28:59.595Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/models.mamba.modeling_mamba.html + 2025-03-21T17:28:59.616Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/common.architectures.html + 2025-03-21T17:28:59.597Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.trainer.html + 2025-03-21T17:28:59.375Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.callbacks.comet_.html + 2025-03-21T17:28:59.677Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.llama_patch_multipack.html + 2025-03-21T17:28:59.259Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.gradient_checkpointing.unsloth.html + 2025-03-21T17:28:59.436Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/core.trainers.base.html + 2025-03-21T17:28:58.867Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.unsloth_.html + 2025-03-21T17:28:59.276Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.samplers.multipack.html + 2025-03-21T17:28:59.658Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.callbacks.profiler.html + 2025-03-21T17:28:59.668Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/integrations.lm_eval.args.html + 2025-03-21T17:28:59.592Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.data.pretraining.html + 2025-03-21T17:28:59.431Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/evaluate.html + 2025-03-21T17:28:58.468Z + + + 
https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.dict.html + 2025-03-21T17:28:59.422Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.utils.html + 2025-03-21T17:28:58.840Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.pygmalion.html + 2025-03-21T17:28:58.989Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/core.training_args.html + 2025-03-21T17:28:58.633Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.inference.html + 2025-03-21T17:28:58.773Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/kernels.lora.html + 2025-03-21T17:28:59.144Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.evaluate.html + 2025-03-21T17:28:58.722Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.collators.batching.html + 2025-03-21T17:28:59.643Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.completion.html + 2025-03-21T17:28:58.961Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.dpo.zephyr.html + 2025-03-21T17:28:59.017Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.metharme.html + 2025-03-21T17:28:58.978Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.orpo.chat_template.html + 2025-03-21T17:28:59.058Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.alpaca_w_system.html + 2025-03-21T17:28:58.934Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.model_shard_quant.html + 2025-03-21T17:28:59.346Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.config.html + 2025-03-21T17:28:58.759Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.schemas.enums.html + 2025-03-21T17:28:59.504Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.preprocess.html + 2025-03-21T17:28:58.802Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/core.chat.messages.html + 2025-03-21T17:28:58.656Z + + + 
https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.dpo.chat_template.html + 2025-03-21T17:28:58.995Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.schemas.peft.html + 2025-03-21T17:28:59.482Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/train.html + 2025-03-21T17:28:58.458Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.messages.chat.html + 2025-03-21T17:28:58.993Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.orcamini.html + 2025-03-21T17:28:58.982Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.collators.mm_chat.html + 2025-03-21T17:28:59.652Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.kto.llama3.html + 2025-03-21T17:28:59.028Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.attention.mllama.html + 2025-03-21T17:28:59.283Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.checks.html + 2025-03-21T17:28:58.742Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.transformers_fa_utils.html + 2025-03-21T17:28:59.275Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.llama_attn_hijack_xformers.html + 2025-03-21T17:28:59.201Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/core.trainers.dpo.trainer.html + 2025-03-21T17:28:58.886Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.user_defined.html + 2025-03-21T17:28:58.942Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/cli.args.html + 2025-03-21T17:28:58.735Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.llama2_chat.html + 2025-03-21T17:28:58.955Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/utils.schemas.config.html + 2025-03-21T17:28:59.443Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/core.trainers.grpo.trainer.html + 2025-03-21T17:28:58.890Z + + + 
https://axolotl-ai-cloud.github.io/axolotl/docs/api/core.chat.format.chatml.html + 2025-03-21T17:28:58.657Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/monkeypatch.lora_kernels.html + 2025-03-21T17:28:59.248Z + + + https://axolotl-ai-cloud.github.io/axolotl/docs/api/prompt_strategies.base.html + 2025-03-21T17:28:58.891Z https://axolotl-ai-cloud.github.io/axolotl/docs/rlhf.html - 2025-03-21T15:02:57.838Z + 2025-03-21T17:28:24.756Z https://axolotl-ai-cloud.github.io/axolotl/docs/cli.html - 2025-03-21T15:02:57.834Z + 2025-03-21T17:28:24.751Z https://axolotl-ai-cloud.github.io/axolotl/docs/unsloth.html - 2025-03-21T15:02:57.838Z + 2025-03-21T17:28:24.756Z https://axolotl-ai-cloud.github.io/axolotl/docs/fsdp_qlora.html - 2025-03-21T15:02:57.835Z + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset_preprocessing.html - 2025-03-21T15:02:57.834Z + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/custom_integrations.html - 2025-03-21T15:02:57.834Z + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/mac.html - 2025-03-21T15:02:57.837Z + 2025-03-21T17:28:24.755Z https://axolotl-ai-cloud.github.io/axolotl/docs/docker.html - 2025-03-21T15:02:57.835Z + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/ray-integration.html - 2025-03-21T15:02:57.838Z + 2025-03-21T17:28:24.756Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/index.html - 2025-03-21T15:02:57.834Z + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html - 2025-03-21T15:02:57.834Z + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/pretraining.html - 2025-03-21T15:02:57.834Z + 2025-03-21T17:28:24.752Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html - 2025-03-21T15:02:57.834Z + 2025-03-21T17:28:24.752Z diff --git a/src/axolotl/integrations/LICENSE.html 
b/src/axolotl/integrations/LICENSE.html index 3ec160e99..aaf5a2033 100644 --- a/src/axolotl/integrations/LICENSE.html +++ b/src/axolotl/integrations/LICENSE.html @@ -143,7 +143,7 @@ ul.task-list li input[type="checkbox"] { + @@ -404,16 +410,60 @@ ul.task-list li input[type="checkbox"] {

    AXOLOTL COMMUNITY LICENSE AGREEMENT

    -

    This Axolotl Community License Agreement (“Agreement”) is entered into by and between Axolotl AI Corp. (“Axolotl”) and any individual or entity (“Licensee”) who wishes to use the Software (as defined below) in accordance with the terms and conditions set forth in this Agreement.

    +

    This Axolotl Community License Agreement (“Agreement”) is entered into by and between Axolotl AI Corp. (“Axolotl”) and +any individual or entity (“Licensee”) who wishes to use the Software (as defined below) in accordance with the terms +and conditions set forth in this Agreement.

      -
    1. Definitions 1.1 “Licensee” refers to any individual or entity who has obtained a copy of the Software under this Agreement. 1.2 “Plugin Integration” means independent integration software modules which may or may not be offered by Axolotl, which may be licensed separately by their respective authors and/or licensors. 1.3 “Software” refers to the specific sub-directory of the Axolotl, Inc. software located at https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations and its subdirectories which permits Plugin Integrations to integrate with the Axolotl service.
    2. -
    3. Grant of License 2.1 Axolotl hereby grants Licensee a worldwide, non-exclusive, royalty-free, license to use, copy, modify, merge, publish, distribute, sublicense, and/or otherwise exploit the Software, subject to the following conditions: - Licensee must comply with all the terms and conditions of this Agreement. - Licensee must include the original copyright notice and disclaimer of warranty in all copies or substantial portions of the Software. 2.2 Licensee may use the Software for any lawful purpose, except as restricted in Section 3.
    4. -
    5. Restrictions 3.1 Licensee shall not use the Software for any activity that constitutes a commercial activity of offering for free or for sale any services, platform, or equivalent to third parties for the purposes of allowing such third parties to fine-tune artificial intelligence models. 3.2 Licensee shall not: - Use the Software for any illegal or unauthorized purpose. - Reverse engineer, decompile, or disassemble the Software. - Remove or modify any copyright, trademark, or other proprietary notices contained in the Software. - Use the Software in a way that could damage, disable, overburden, or impair the functionality of the Software or interfere with any third-party use of the Software. 3.3 Axolotl reserves the right to restrict certain Plugin Integrations for use with the Software. To the extent Licensee integrates a permitted, applicable Plugin Integration with the Software, Licensee shall comply with any additional terms and conditions imposed by the licensors of such Plugin Integration for use of such Plugin Integrations. Licensee shall contact Axolotl if it has questions about whether its use of the Software falls beyond the scope of this Agreement.
    6. -
    7. Intellectual Property Rights 4.1 Axolotl and its contributors retain all intellectual property rights in and to the Software. Licensee acknowledges that this Agreement does not transfer any ownership rights or intellectual property rights to Licensee.
    8. -
    9. Disclaimer of Warranty 5.1 THE SOFTWARE IS PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    10. -
    11. Termination 6.1 Axolotl may terminate this Agreement at any time if Licensee fails to comply with any of the terms and conditions set forth herein. Upon termination, Licensee shall cease all use of the Software and destroy any copies in its possession.
    12. -
    13. Governing Law 7.1 This Agreement shall be governed by and construed in accordance with the laws of the State of California, without regards to conflicts of laws provisions thereof.
    14. -
    15. Entire Agreement 8.1 This Agreement constitutes the entire agreement between Axolotl and Licensee with respect to the subject matter hereof and supersedes all prior or contemporaneous understandings or agreements between the parties concerning the Software, whether written or oral. Axolotl may update the terms of this Agreement from time to time, and Licensee’s continued use of the Software after any such updates shall constitute acceptance of updated terms on a go-forward basis. Axolotl will use commercially reasonable efforts to provide Licensee notice of any material updates. By using the Software, Licensee acknowledges that it has read, understood, and agrees to be bound by the terms and conditions of this Agreement.
    16. +
    17. Definitions +1.1 “Licensee” refers to any individual or entity who has obtained a copy of the Software under this Agreement. +1.2 “Plugin Integration” means independent integration software modules which may or may not be offered by Axolotl, +which may be licensed separately by their respective authors and/or licensors. +1.3 “Software” refers to the specific sub-directory of the Axolotl, Inc. software located at +https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations and its subdirectories which +permits Plugin Integrations to integrate with the Axolotl service.
    18. +
    19. Grant of License +2.1 Axolotl hereby grants Licensee a worldwide, non-exclusive, royalty-free, license to use, copy, modify, merge, +publish, distribute, sublicense, and/or otherwise exploit the Software, subject to the following conditions: +- Licensee must comply with all the terms and conditions of this Agreement. +- Licensee must include the original copyright notice and disclaimer of warranty in all copies or substantial +portions of the Software. +2.2 Licensee may use the Software for any lawful purpose, except as restricted in Section 3.
    20. +
    21. Restrictions +3.1 Licensee shall not use the Software for any activity that constitutes a commercial activity of offering for +free or for sale any services, platform, or equivalent to third parties for the purposes of allowing such +third parties to fine-tune artificial intelligence models. +3.2 Licensee shall not: +- Use the Software for any illegal or unauthorized purpose. +- Reverse engineer, decompile, or disassemble the Software. +- Remove or modify any copyright, trademark, or other proprietary notices contained in the Software. +- Use the Software in a way that could damage, disable, overburden, or impair the functionality of the +Software or interfere with any third-party use of the Software. +3.3 Axolotl reserves the right to restrict certain Plugin Integrations for use with the Software. To the extent Licensee integrates a permitted, applicable Plugin Integration with the Software, Licensee shall comply with any additional terms and conditions imposed by the licensors of such Plugin Integration for use of such Plugin Integrations. Licensee shall contact Axolotl if it has questions about whether its use of the Software falls beyond the scope of this Agreement.
    22. +
    23. Intellectual Property Rights +4.1 Axolotl and its contributors retain all intellectual property rights in and to the Software. Licensee +acknowledges that this Agreement does not transfer any ownership rights or intellectual property rights to +Licensee.
    24. +
    25. Disclaimer of Warranty +5.1 THE SOFTWARE IS PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF +CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE.
    26. +
    27. Termination +6.1 Axolotl may terminate this Agreement at any time if Licensee fails to comply with any of the terms and +conditions set forth herein. Upon termination, Licensee shall cease all use of the Software and destroy any +copies in its possession.
    28. +
    29. Governing Law +7.1 This Agreement shall be governed by and construed in accordance with the laws of the State of California, +without regards to conflicts of laws provisions thereof.
    30. +
    31. Entire Agreement +8.1 This Agreement constitutes the entire agreement between Axolotl and Licensee with respect to the subject matter +hereof and supersedes all prior or contemporaneous understandings or agreements between the parties concerning +the Software, whether written or oral. Axolotl may update the terms of this Agreement from time to time, and +Licensee’s continued use of the Software after any such updates shall constitute acceptance of updated terms +on a go-forward basis. Axolotl will use commercially reasonable efforts to provide Licensee notice of any +material updates. By using the Software, Licensee acknowledges that it has read, understood, and agrees to be +bound by the terms and conditions of this Agreement.

    This Agreement was last updated on August 23, 2024.

    diff --git a/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html b/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html index ffb183a5f..e82b1021a 100644 --- a/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html +++ b/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html @@ -143,7 +143,7 @@ ul.task-list li input[type="checkbox"] { + @@ -397,7 +403,8 @@ ul.task-list li input[type="checkbox"] {

    Acknowledgements

    -

    Portions of this Cut Cross Entropy Software may utilize the following copyrighted material, the use of which is hereby acknowledged.

    +

    Portions of this Cut Cross Entropy Software may utilize the following copyrighted +material, the use of which is hereby acknowledged.


    PyTorch

    From PyTorch:
    diff --git a/styles.css b/styles.css
    index 749ff4366..c5b0768fa 100644
    --- a/styles.css
    +++ b/styles.css
    @@ -14,7 +14,7 @@
     h1 {
         font-family: var(--font-title);
         font-weight: 400;
    -    font-size: 5rem;
    +    font-size: 3rem;
         line-height: 1.1;
         letter-spacing: -0.05em;
         font-feature-settings: "ss01" on;
    @@ -24,7 +24,7 @@ h1 {
     h2 {
         font-family: var(--font-title);
         font-weight: 500;
    -    font-size: 2rem;
    +    font-size: 1.5rem;
         line-height: 1.2;
         letter-spacing: -0.03em;
         font-feature-settings: "ss01" on;
    @@ -35,7 +35,7 @@ h3,
     h4 {
         font-family: var(--font-body);
         font-weight: 400;
    -    font-size: 1.5rem;
    +    font-size: 1.25rem;
         line-height: 1.5;
         letter-spacing: -0.02em;
     }
    @@ -191,3 +191,87 @@ code span.er {
         color: #5cb85c !important;
         text-decoration: none !important;
     }
    +
    +/* API Documentation Styling */
    +
    +/* Improve docstring section rendering */
    +.level3 p {
    +    white-space: pre-line !important;
    +}
    +
    +/* Format docstring sections */
    +.level3 p strong {
    +    display: block;
    +    margin-top: 1em;
    +    font-weight: bold;
    +    color: var(--cyan);
    +}
    +
    +/* Add spacing after sections */
    +.level3 p:has(strong) {
    +    margin-bottom: 0.5em;
    +}
    +
    +/* Format Args and Returns sections */
    +p:has(code) {
    +    line-height: 1.6;
    +}
    +
    +/* Function signatures */
    +.sourceCode {
    +    margin-bottom: 1.5em;
    +}
    +
    +/* Parameter tables */
    +.doc-section-parameters table,
    +.doc-section-returns table {
    +    margin-top: 1em;
    +    margin-bottom: 1.5em;
    +}
    +
    +/* Make parameter and returns headers smaller */
    +h2.anchored[data-anchor-id="parameters"],
    +h2.anchored[data-anchor-id="returns"],
    +.doc-section-parameters h4,
    +.doc-section-returns h4 {
    +    font-size: 1.25rem;
    +    margin-top: 2rem;
    +    margin-bottom: 1rem;
    +    color: var(--lime);
    +    border-bottom: 1px solid var(--lime);
    +    padding-bottom: 0.3rem;
    +    font-family: var(--font-body);
    +    font-weight: 500;
    +    letter-spacing: normal;
    +}
    +
    +/* Style documentation tables */
    +table {
    +    width: 100%;
    +    margin-bottom: 1.5rem;
    +    border-collapse: collapse;
    +}
    +
    +table th {
    +    background-color: #1a1a1a;
    +    padding: 0.5rem 1rem;
    +    border-bottom: 2px solid var(--greige-600);
    +    text-align: left;
    +}
    +
    +table td {
    +    padding: 0.5rem 1rem;
    +    border-bottom: 1px solid var(--greige-600);
    +}
    +
    +/* Code in table cells */
    +table td code {
    +    background-color: transparent !important;
    +    padding: 0;
    +}
    +
    +/* Improve spacing in parameter and return tables */
    +.doc-section-parameters,
    +.doc-section-returns {
    +    margin-top: 1rem;
    +}