diff --git a/.nojekyll b/.nojekyll index b97d766d2..29ef7690d 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -6761c9c5 \ No newline at end of file +3fe6ba85 \ No newline at end of file diff --git a/docs/config-reference.html b/docs/config-reference.html index 8bce7bbf4..33c7b4927 100644 --- a/docs/config-reference.html +++ b/docs/config-reference.html @@ -637,1081 +637,1082 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); dpo_label_smoothing: float | None dpo_norm_loss: bool | None dpo_padding_free: bool | None - -# A list of one or more datasets to finetune the model with -datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None - # For SFTDataset: - # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory - path: str | None - # name of dataset split to load from - split: str | None - # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection] - type: str | UserDefinedPrompterType | None - # For UserDefinedPrompterType: - # Custom user instruction prompt - system_prompt: str | None - # Use {system} as key to be replaced - system_format: str | None - field_system: str | None - field_instruction: str | None - field_input: str | None - field_output: str | None - - # Customizable to be single line or multi-line. Use {instruction}/{input} as key to - # be replaced. 'format' can include {input} - format: str | None - # 'no_input_format' cannot include {input} - no_input_format: str | None - # For `completion` datsets only, uses the provided field instead of `text` column - field: str | None - input_transform: str | None - # split dataset into N pieces (use with shards_idx) - shards: int | None - # the index of sharded dataset to use - shards_idx: int | None - # process dataset in N sequential chunks for memory efficiency (exclusive with - # `shards`) - preprocess_shards: int | None - conversation: str | None - - # The name of the chat template to use for training, following values are supported: - # tokenizer_default: Uses the chat template that is available in the - # tokenizer_config.json. If the chat template is not available in the tokenizer, it - # will raise an error. This is the default. - # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates - # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. - # tokenizer_default_fallback_*: where * is the name of the chat template to fallback - # to if the tokenizer does not have a chat template else default to tokenizer. E.g. - # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat - # template. The custom jinja template should be provided in the chat_template_jinja - # field. - chat_template: ChatTemplate | str | None - # Custom jinja chat template. Used only if `chat_template: jinja` or empty. - chat_template_jinja: str | None - # path to source data files - data_files: str | list[str] | None - input_format: str | None - # name of dataset configuration to load - name: str | None - # defines the datatype when path is a file - ds_type: str | None - field: str | None - field_human: str | None - field_model: str | None - # Key containing the messages (default: "messages") - field_messages: str | None - # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON - # schema](https://json-schema.org/learn/getting-started-step-by-step). 
- field_tools: str | None - - message_field_role: str | None - - message_field_content: str | None - # Mapping of properties from the input dataset to the chat template. (default: - # message_property_mappings={'role':'role', 'content':'content'}) If a property exists - # in the template but not in this mapping, the system will attempt to load it directly - # from the message using the property name as the key. Example: In the mapping below, - # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and - # used as 'content' in the chat template. - message_property_mappings: dict[str, str] | None - # The key in the message turn that indicates via boolean whether tokens of a turn - # should be considered for training. Useful to selectively train on certain turns - # besides the `roles_to_train`. - message_field_training: str | None - # The key in the message turn that contains the training details. Useful to - # selectively train on certain tokens in a turn. The value of the key is a List[Dict] - # containing `begin_offset` (start character index in content), `end_offset` (end - # character index in content), and `train` (boolean whether to train). - message_field_training_detail: str | None - # (for Qwen3 template only) Whether to split the assistant content based on a - # reasoning trace inside delimited tags - split_thinking: bool | None - logprobs_field: str | None - temperature: float | None - # Roles to train on. The tokens from these roles will be considered for the loss. - roles_to_train: list[str] | None - # Which EOS tokens to train on in the conversation. Possible values are: all: train on - # all EOS tokens, turn (default): train on the EOS token at the end of each trainable - # turn, last: train on the last EOS token in the conversation - train_on_eos: Literal['all', 'turn', 'last'] | None - # Roles mapping in the messages. The format is {target_role: [source_roles]}. All - # source roles will be mapped to the target role. The default is: user: ["human", - # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"] - roles: dict[str, list[str]] | None - # Whether to drop the system turn from the dataset. Only works with chat_template. - # This does not drop the default system message from chat_template if it exists. If - # you wish to, we recommend using a custom jinja template with the default system - # message removed or adding a system turn with empty content. - drop_system_message: bool | None - # Trust remote code for untrusted source - trust_remote_code: bool | None = False - # The specific revision of the dataset to use when loading from the Hugging Face Hub. - # This can be a commit hash, tag, or branch name. If not specified, the latest version - # will be used. This parameter is ignored for local datasets. 
- revision: str | None - - # For DPODataset: - path: str | None - split: str | None - type: UserDefinedDPOType | str | None - # For UserDefinedDPOType: - field_system: str | None - field_prompt: str | None - field_chosen: str | None - field_rejected: str | None - prompt_format: str | None - chosen_format: str | None - rejected_format: str | None - data_files: list[str] | None - revision: str | None - field_messages: str | None - - # For KTODataset: - path: str | None - split: str | None - type: UserDefinedKTOType | str | None - # For UserDefinedKTOType: - field_system: str | None - field_prompt: str | None - field_completion: str | None - field_label: bool | None - prompt_format: str | None - completion_format: str | None - data_files: list[str] | None - trust_remote_code: bool | None = False - revision: str | None - - # For StepwiseSupervisedDataset: - path: str | None - split: str | None - data_files: list[str] | None - revision: str | None - step_separator: str | None - max_completion_length: int | None - train_on_last_step_only: bool | None - -# A list of one or more datasets to eval the model with. You can use either -# test_datasets, or val_set_size, but not both. -test_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None - # For SFTDataset: - # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory - path: str | None - # name of dataset split to load from - split: str | None - # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection] - type: str | UserDefinedPrompterType | None - # For UserDefinedPrompterType: - # Custom user instruction prompt - system_prompt: str | None - # Use {system} as key to be replaced - system_format: str | None - field_system: str | None - field_instruction: str | None - field_input: str | None - field_output: str | None - - # Customizable to be single line or multi-line. Use {instruction}/{input} as key to - # be replaced. 'format' can include {input} - format: str | None - # 'no_input_format' cannot include {input} - no_input_format: str | None - # For `completion` datsets only, uses the provided field instead of `text` column - field: str | None - input_transform: str | None - # split dataset into N pieces (use with shards_idx) - shards: int | None - # the index of sharded dataset to use - shards_idx: int | None - # process dataset in N sequential chunks for memory efficiency (exclusive with - # `shards`) - preprocess_shards: int | None - conversation: str | None - - # The name of the chat template to use for training, following values are supported: - # tokenizer_default: Uses the chat template that is available in the - # tokenizer_config.json. If the chat template is not available in the tokenizer, it - # will raise an error. This is the default. - # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates - # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. - # tokenizer_default_fallback_*: where * is the name of the chat template to fallback - # to if the tokenizer does not have a chat template else default to tokenizer. E.g. - # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat - # template. The custom jinja template should be provided in the chat_template_jinja - # field. - chat_template: ChatTemplate | str | None - # Custom jinja chat template. Used only if `chat_template: jinja` or empty. 
- chat_template_jinja: str | None - # path to source data files - data_files: str | list[str] | None - input_format: str | None - # name of dataset configuration to load - name: str | None - # defines the datatype when path is a file - ds_type: str | None - field: str | None - field_human: str | None - field_model: str | None - # Key containing the messages (default: "messages") - field_messages: str | None - # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON - # schema](https://json-schema.org/learn/getting-started-step-by-step). - field_tools: str | None - - message_field_role: str | None - - message_field_content: str | None - # Mapping of properties from the input dataset to the chat template. (default: - # message_property_mappings={'role':'role', 'content':'content'}) If a property exists - # in the template but not in this mapping, the system will attempt to load it directly - # from the message using the property name as the key. Example: In the mapping below, - # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and - # used as 'content' in the chat template. - message_property_mappings: dict[str, str] | None - # The key in the message turn that indicates via boolean whether tokens of a turn - # should be considered for training. Useful to selectively train on certain turns - # besides the `roles_to_train`. - message_field_training: str | None - # The key in the message turn that contains the training details. Useful to - # selectively train on certain tokens in a turn. The value of the key is a List[Dict] - # containing `begin_offset` (start character index in content), `end_offset` (end - # character index in content), and `train` (boolean whether to train). - message_field_training_detail: str | None - # (for Qwen3 template only) Whether to split the assistant content based on a - # reasoning trace inside delimited tags - split_thinking: bool | None - logprobs_field: str | None - temperature: float | None - # Roles to train on. The tokens from these roles will be considered for the loss. - roles_to_train: list[str] | None - # Which EOS tokens to train on in the conversation. Possible values are: all: train on - # all EOS tokens, turn (default): train on the EOS token at the end of each trainable - # turn, last: train on the last EOS token in the conversation - train_on_eos: Literal['all', 'turn', 'last'] | None - # Roles mapping in the messages. The format is {target_role: [source_roles]}. All - # source roles will be mapped to the target role. The default is: user: ["human", - # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"] - roles: dict[str, list[str]] | None - # Whether to drop the system turn from the dataset. Only works with chat_template. - # This does not drop the default system message from chat_template if it exists. If - # you wish to, we recommend using a custom jinja template with the default system - # message removed or adding a system turn with empty content. - drop_system_message: bool | None - # Trust remote code for untrusted source - trust_remote_code: bool | None = False - # The specific revision of the dataset to use when loading from the Hugging Face Hub. - # This can be a commit hash, tag, or branch name. If not specified, the latest version - # will be used. This parameter is ignored for local datasets. 
- revision: str | None - - # For DPODataset: - path: str | None - split: str | None - type: UserDefinedDPOType | str | None - # For UserDefinedDPOType: - field_system: str | None - field_prompt: str | None - field_chosen: str | None - field_rejected: str | None - prompt_format: str | None - chosen_format: str | None - rejected_format: str | None - data_files: list[str] | None - revision: str | None - field_messages: str | None - - # For KTODataset: - path: str | None - split: str | None - type: UserDefinedKTOType | str | None - # For UserDefinedKTOType: - field_system: str | None - field_prompt: str | None - field_completion: str | None - field_label: bool | None - prompt_format: str | None - completion_format: str | None - data_files: list[str] | None - trust_remote_code: bool | None = False - revision: str | None - - # For StepwiseSupervisedDataset: - path: str | None - split: str | None - data_files: list[str] | None - revision: str | None - step_separator: str | None - max_completion_length: int | None - train_on_last_step_only: bool | None - -# If false, the datasets will not be shuffled and will keep their original order in -# `datasets`. The same applies to the `test_datasets` option and the -# `pretraining_dataset` option. Default is true. -shuffle_merged_datasets: bool | None = True -# Axolotl attempts to save the dataset as an arrow after packing the data together so -# subsequent training attempts load faster, relative path -dataset_prepared_path: str | None -# Num shards for whole dataset -dataset_shard_num: int | None -# Index of shard to use for whole dataset -dataset_shard_idx: int | None -skip_prepare_dataset: bool | None = False - -# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize -pretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None - # For PretrainingDataset: - name: str | None - path: str | None - split: str | None = train - text_column: str | None = text - type: str | None = pretrain - trust_remote_code: bool | None = False - data_files: str | None - skip: int | None - - # For SFTDataset: - # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory - path: str | None - # name of dataset split to load from - split: str | None - # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection] - type: str | UserDefinedPrompterType | None - # For UserDefinedPrompterType: - # Custom user instruction prompt - system_prompt: str | None - # Use {system} as key to be replaced - system_format: str | None - field_system: str | None - field_instruction: str | None - field_input: str | None - field_output: str | None - - # Customizable to be single line or multi-line. Use {instruction}/{input} as key to - # be replaced. 'format' can include {input} - format: str | None - # 'no_input_format' cannot include {input} - no_input_format: str | None - # For `completion` datsets only, uses the provided field instead of `text` column - field: str | None - input_transform: str | None - # split dataset into N pieces (use with shards_idx) - shards: int | None - # the index of sharded dataset to use - shards_idx: int | None - # process dataset in N sequential chunks for memory efficiency (exclusive with - # `shards`) - preprocess_shards: int | None - conversation: str | None - - # The name of the chat template to use for training, following values are supported: - # tokenizer_default: Uses the chat template that is available in the - # tokenizer_config.json. 
If the chat template is not available in the tokenizer, it - # will raise an error. This is the default. - # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates - # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. - # tokenizer_default_fallback_*: where * is the name of the chat template to fallback - # to if the tokenizer does not have a chat template else default to tokenizer. E.g. - # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat - # template. The custom jinja template should be provided in the chat_template_jinja - # field. - chat_template: ChatTemplate | str | None - # Custom jinja chat template. Used only if `chat_template: jinja` or empty. - chat_template_jinja: str | None - # path to source data files - data_files: str | list[str] | None - input_format: str | None - # name of dataset configuration to load - name: str | None - # defines the datatype when path is a file - ds_type: str | None - field: str | None - field_human: str | None - field_model: str | None - # Key containing the messages (default: "messages") - field_messages: str | None - # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON - # schema](https://json-schema.org/learn/getting-started-step-by-step). - field_tools: str | None - - message_field_role: str | None - - message_field_content: str | None - # Mapping of properties from the input dataset to the chat template. (default: - # message_property_mappings={'role':'role', 'content':'content'}) If a property exists - # in the template but not in this mapping, the system will attempt to load it directly - # from the message using the property name as the key. Example: In the mapping below, - # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and - # used as 'content' in the chat template. - message_property_mappings: dict[str, str] | None - # The key in the message turn that indicates via boolean whether tokens of a turn - # should be considered for training. Useful to selectively train on certain turns - # besides the `roles_to_train`. - message_field_training: str | None - # The key in the message turn that contains the training details. Useful to - # selectively train on certain tokens in a turn. The value of the key is a List[Dict] - # containing `begin_offset` (start character index in content), `end_offset` (end - # character index in content), and `train` (boolean whether to train). - message_field_training_detail: str | None - # (for Qwen3 template only) Whether to split the assistant content based on a - # reasoning trace inside delimited tags - split_thinking: bool | None - logprobs_field: str | None - temperature: float | None - # Roles to train on. The tokens from these roles will be considered for the loss. - roles_to_train: list[str] | None - # Which EOS tokens to train on in the conversation. Possible values are: all: train on - # all EOS tokens, turn (default): train on the EOS token at the end of each trainable - # turn, last: train on the last EOS token in the conversation - train_on_eos: Literal['all', 'turn', 'last'] | None - # Roles mapping in the messages. The format is {target_role: [source_roles]}. All - # source roles will be mapped to the target role. The default is: user: ["human", - # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"] - roles: dict[str, list[str]] | None - # Whether to drop the system turn from the dataset. Only works with chat_template. 
- # This does not drop the default system message from chat_template if it exists. If - # you wish to, we recommend using a custom jinja template with the default system - # message removed or adding a system turn with empty content. - drop_system_message: bool | None - # Trust remote code for untrusted source - trust_remote_code: bool | None = False - # The specific revision of the dataset to use when loading from the Hugging Face Hub. - # This can be a commit hash, tag, or branch name. If not specified, the latest version - # will be used. This parameter is ignored for local datasets. - revision: str | None - -# The maximum number of processes to use while preprocessing your input dataset. This -# defaults to `os.cpu_count()` if not set. -dataset_processes: int | None = 4 -# Deduplicates datasets and test_datasets with identical entries -dataset_exact_deduplication: bool | None -# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking -# too much storage -dataset_keep_in_memory: bool | None -dataloader_pin_memory: bool | None -dataloader_num_workers: int | None -dataloader_prefetch_factor: int | None -dataloader_drop_last: bool | None - -accelerator_config: dict[str, Any] | None - -remove_unused_columns: bool | None - -# Push prepared dataset to hub - repo_org/repo_name -push_dataset_to_hub: str | None -# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private -# datasets. Required to be true when used in combination with `push_dataset_to_hub` -hf_use_auth_token: bool | None - -device: Any | None -# Passed through to transformers when loading the model when launched without -# accelerate. Use `sequential` when training w/ model parallelism to limit memory -device_map: Any | None -world_size: int | None -# Don't mess with this, it's here for accelerate and torchrun -local_rank: int | None -ddp: bool | None - -# Seed for reproducibility -seed: int | None -# Advanced DDP Arguments - timeout -ddp_timeout: int | None -# Advanced DDP Arguments - bucket cap in MB -ddp_bucket_cap_mb: int | None -# Advanced DDP Arguments - broadcast buffers -ddp_broadcast_buffers: bool | None -ddp_find_unused_parameters: bool | None - -# Approximate number of predictions sent to wandb depending on batch size. Enabled above -# 0. Default is 0 -eval_table_size: int | None -# Total number of tokens generated for predictions sent to wandb. Default is 128 -eval_max_new_tokens: int | None -# Whether to run causal language model evaluation for metrics in -# `eval_causal_lm_metrics` -do_causal_lm_eval: bool | None -# HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter', -# 'chrf', 'perplexity'] -eval_causal_lm_metrics: list[str] | None -do_bench_eval: bool | None -bench_dataset: str | None -bench_split: str | None -metric_for_best_model: str | None -greater_is_better: bool | None - -# High loss value, indicating the learning has broken down (a good estimate is ~2 times -# the loss at the start of training) -loss_watchdog_threshold: float | None -# Number of high-loss steps in a row before the trainer aborts (default: 3) -loss_watchdog_patience: int | None - -gc_steps: int | None - -# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. 
-# require >=ampere -bf16: Literal['auto'] | bool | None = auto -# Use CUDA fp16 -fp16: bool | None -fp8: bool | None -# No AMP (automatic mixed precision) - require >=ampere -bfloat16: bool | None -# No AMP (automatic mixed precision) -float16: bool | None -# Use CUDA tf32 - require >=ampere -tf32: bool | None -float32: bool | None - -# Whether to use gradient checkpointing. Available options are: true, false, 'offload', -# 'offload_disk'. -# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing -gradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False -# Additional kwargs to pass to the trainer for gradient checkpointing -gradient_checkpointing_kwargs: dict[str, Any] | None - -unfrozen_parameters: list[str] | None - -# The maximum length of an input to train with, this should typically be less than 2048 -# as most models have a token/context limit of 2048 -sequence_len: int = 512 -# The maximum length of an input for evaluation. If not specified, defaults to -# sequence_len -eval_sequence_len: int | None -min_sample_len: int | None -# maximum prompt length for RL training -max_prompt_len: int = 512 -# Use efficient multi-packing with block diagonal attention and per sequence -# position_ids. Recommend set to 'true' -sample_packing: bool | None -# The number of samples packed at a time. Increasing the following values helps with -# packing, but usually only slightly (<%1.) -sample_packing_group_size: int | None = 100000 -# The number of samples which can be packed into one sequence. Increase if using a large -# sequence_len with many short samples. -sample_packing_bin_size: int | None = 200 -# Whether to pack samples sequentially -sample_packing_sequentially: bool | None -# The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or -# 'forkserver' -sample_packing_mp_start_method: str | None -# Set to 'false' if getting errors during eval with sample_packing on -eval_sample_packing: bool | None -# Pad inputs so each step uses constant sized buffers. 
This will reduce memory -# fragmentation and may prevent OOMs, by re-using memory more efficiently -pad_to_sequence_len: bool | None -# Whether to use sequential sampling for curriculum learning -curriculum_sampling: bool | None -multipack_real_batches: bool | None -# whether to concatenate samples during pretraining -pretraining_sample_concatenation: bool | None - -# Use batch flattening for speedups when not using sample_packing -batch_flattening: Literal['auto'] | bool | None - -use_pose: bool | None -pose_split_on_token_ids: list[int] | None -pose_max_context_len: int | None -pose_num_chunks: int | None - -pretrain_multipack_buffer_size: int | None = 10000 -# whether to prevent cross attention for packed sequences during pretraining -pretrain_multipack_attn: bool | None = True - -# Whether to use xformers attention patch https://github.com/facebookresearch/xformers -xformers_attention: bool | None -# Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/ -# torch.nn.functional.scaled_dot_product_attention.html -sdp_attention: bool | None -# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf -s2_attention: bool | None -flex_attention: bool | None -flex_attn_compile_kwargs: dict[str, Any] | None -# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention -flash_attention: bool | None -# Whether to use flash-attention cross entropy implementation - advanced use only -flash_attn_cross_entropy: bool | None -# Whether to use flash-attention rms norm implementation - advanced use only -flash_attn_rms_norm: bool | None -# Whether to fuse QKV into a single operation -flash_attn_fuse_qkv: bool | None -# Whether to fuse part of the MLP into a single operation -flash_attn_fuse_mlp: bool | None -# Whether to use bettertransformers -flash_optimum: bool | None - -eager_attention: bool | None - -unsloth_cross_entropy_loss: bool | None -unsloth_lora_mlp: bool | None -unsloth_lora_qkv: bool | None -unsloth_lora_o: bool | None -unsloth_rms_norm: bool | None -unsloth_rope: bool | None - -# Apply custom LoRA autograd functions and activation function Triton kernels for speed -# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html -lora_mlp_kernel: bool | None -# Apply custom LoRA autograd functions and activation function Triton kernels for speed -# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html -lora_qkv_kernel: bool | None -# Apply custom LoRA autograd functions and activation function Triton kernels for speed -# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html -lora_o_kernel: bool | None - -# Whether to use chunked cross entropy loss for memory efficiency -chunked_cross_entropy: bool | None -# Number of chunks to use for chunked cross entropy loss -chunked_cross_entropy_num_chunks: int | None - -llama4_linearized_experts: bool | None - -# Deepspeed config path. e.g., deepspeed_configs/zero3.json -deepspeed: str | dict[str, Any] | None -# FSDP configuration -fsdp: list[str] | None -# FSDP configuration options -fsdp_config: dict[str, Any] | None -fsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None - -# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for -# no eval. -val_set_size: float | None = 0.0 - -# Set to a divisor of the number of GPUs available to split sequences into chunks of -# equal size. 
Use in long context training to prevent OOM when sequences cannot fit into -# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each -# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized -# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more -# details. -sequence_parallel_degree: int | None -# Optional; strides across the key dimension. Larger values use more memory but should -# make training faster. Must evenly divide the number of KV heads in your model. -heads_k_stride: int | None -# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to -# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing -# case. -ring_attn_func: RingAttnFunc | None - -# Add or change special tokens. If you add tokens here, you don't need to add them to -# the `tokens` list. -special_tokens: SpecialTokensConfig | None - # For SpecialTokensConfig: - bos_token: str | None - eos_token: str | None - pad_token: str | None - unk_token: str | None - additional_special_tokens: list[str] | None - -# Add extra tokens to the tokenizer -tokens: list[str] | None -# Mapping token_id to new_token_string to override reserved added_tokens in the -# tokenizer. Only works for tokens that are not part of the base vocab (aka are -# added_tokens). Can be checked if they exist in tokenizer.json added_tokens. -added_tokens_overrides: dict[int, str] | None - -# Whether to use torch.compile and which backend to use. setting to `auto` will enable -# torch compile when torch>=2.5.1 -torch_compile: Literal['auto'] | bool | None -# Backend to use for torch.compile -torch_compile_backend: str | None -torch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None - -# Maximum number of iterations to train for. It precedes num_epochs which means that if -# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps => -# `num_epochs: 2` and `max_steps: 100` will train for 100 steps -max_steps: int | None -# Number of warmup steps. Cannot use with warmup_ratio -warmup_steps: int | None -# Warmup ratio. Cannot use with warmup_steps -warmup_ratio: float | None -# Leave empty to eval at each epoch, integer for every N steps. float for fraction of -# total steps -eval_steps: int | float | None -# Number of times per epoch to run evals, mutually exclusive with eval_steps -evals_per_epoch: int | None -# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer -# from `eval_steps` -eval_strategy: str | None -# Leave empty to save at each epoch, integer for every N steps. float for fraction of -# total steps -save_steps: int | float | None -# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps -saves_per_epoch: int | None -# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better -# result is achieved, leave empty to infer from `save_steps` -save_strategy: str | None -# Checkpoints saved at a time -save_total_limit: int | None -# Logging frequency -logging_steps: int | None -# Stop training after this many evaluation losses have increased in a row. https://huggi -# ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin -# gCallback -early_stopping_patience: int | None -load_best_model_at_end: bool | None = False -# Save only the model weights, skipping the optimizer. Using this means you can't resume -# from checkpoints. 
-save_only_model: bool | None = False -# Use tensorboard for logging -use_tensorboard: bool | None -# Enable the pytorch profiler to capture the first N steps of training to the -# output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more -# information. Snapshots can be visualized @ https://pytorch.org/memory_viz -profiler_steps: int | None -# bool of whether to include tokens trainer per second in the training metrics. This -# iterates over the entire dataset once, so it takes some time. -include_tokens_per_second: bool | None - -# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to -# add noise to embeddings. Currently only supported on Llama and Mistral -neftune_noise_alpha: float | None - -# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to -# `beta` in `ORPOConfig` due to trl mapping. -orpo_alpha: float | None -# Weighting of NLL term in loss from RPO paper -rpo_alpha: float | None -# Target reward margin for the SimPO loss -simpo_gamma: float | None -# Weight of the BC regularizer -cpo_alpha: float | None - -# Factor for desirable loss term in KTO loss -kto_desirable_weight: float | None -# Factor for undesirable loss term in KTO loss -kto_undesirable_weight: float | None -# The beta parameter for the RL training -rl_beta: float | None - -# Defines the max memory usage per gpu on the system. Passed through to transformers -# when loading the model. -max_memory: dict[int | Literal['cpu', 'disk'], int | str] | None -# Limit the memory for all available GPUs to this amount (if an integer, expressed in -# gigabytes); default: unset -gpu_memory_limit: int | str | None -# Whether to use low_cpu_mem_usage -low_cpu_mem_usage: bool | None - -# The name of the chat template to use for training, following values are supported: -# tokenizer_default: Uses the chat template that is available in the -# tokenizer_config.json. If the chat template is not available in the tokenizer, it will -# raise an error. This is the default value. -# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates -# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. -# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. -# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not -# available in the tokenizer. jinja: Uses a custom jinja template for the chat template. -# The custom jinja template should be provided in the chat_template_jinja field. The -# selected chat template will be saved to the tokenizer_config.json for easier -# inferencing -chat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None -# Custom jinja template for chat template. This will be only used if chat_template is -# set to `jinja` or `null` (in which case chat_template is automatically set to -# `jinja`). Default is null. -chat_template_jinja: str | None -# Additional kwargs to pass to the chat template. This is useful for customizing the -# chat template. For example, you can pass `thinking=False` to add a generation prompt -# to the chat template. -chat_template_kwargs: dict[str, Any] | None -# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the -# boundaries between conversation turns. For example: ['/INST', '</s>', -# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. 
This is -# useful for templates that use multiple delimiter tokens. -eot_tokens: list[str] | None -# Changes the default system message. Currently only supports chatml. -default_system_message: str | None - -fix_untrained_tokens: int | list[int] | None - -is_preprocess: bool | None -preprocess_iterable: bool | None - -# Total number of tokens - internal use -total_num_tokens: int | None -total_supervised_tokens: int | None -# You can set these packing optimizations AFTER starting a training at least once. The -# trainer will provide recommended values for these values. -sample_packing_eff_est: float | None -axolotl_config_path: str | None - -# Internal use only - Used to identify which the model is based on -is_falcon_derived_model: bool | None -# Internal use only - Used to identify which the model is based on -is_llama_derived_model: bool | None -# Internal use only - Used to identify which the model is based on. Please note that if -# you set this to true, `padding_side` will be set to 'left' by default -is_mistral_derived_model: bool | None -# Internal use only - Used to identify which the model is based on -is_qwen_derived_model: bool | None - -# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available -# plugins or doc below for more details. -# https://docs.axolotl.ai/docs/custom_integrations.html -plugins: list[str] | None - -# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This -# can also be a relative path to a model on disk -base_model: str (required) -# If the base_model repo on hf hub doesn't include configuration .json files, You can -# set that here, or leave this empty to default to base_model -base_model_config: str | None -cls_model_config: str | None -# Optional tokenizer configuration path in case you want to use a different tokenizer -# than the one defined in the base model -tokenizer_config: str | None -# use_fast option for tokenizer loading from_pretrained, default to True -tokenizer_use_fast: bool | None -# Whether to use the legacy tokenizer setting, defaults to True -tokenizer_legacy: bool | None -# Whether to use mistral-common tokenizer. If set to True, it will use the mistral- -# common tokenizer. -tokenizer_use_mistral_common: bool | None -# Corresponding tokenizer for the model AutoTokenizer is a good choice -tokenizer_type: str | None -# transformers processor class -processor_type: str | None -# Trust remote code for untrusted source -trust_remote_code: bool | None - -# Where to save the full-finetuned model to -output_dir: str = ./model-out -# push checkpoints to hub -hub_model_id: str | None -# how to push checkpoints to hub -hub_strategy: str | None -# Save model as safetensors (require safetensors package). Default True -save_safetensors: bool | None = True - -# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer -load_in_8bit: bool | None = False -# Use bitsandbytes 4 bit -load_in_4bit: bool | None = False - -# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in -# original model -adapter: str | None -# If you already have a lora model trained that you want to load, put that here. This -# means after training, if you want to test the model, you should set this to the value -# of `output_dir`. Note that if you merge an adapter to the base model, a new -# subdirectory `merged` will be created under the `output_dir`. 
-lora_model_dir: str | None -lora_r: int | None -lora_alpha: int | None -lora_fan_in_fan_out: bool | None -lora_target_modules: str | list[str] | None -# If true, will target all linear modules -lora_target_linear: bool | None -# If you added new tokens to the tokenizer, you may need to save some LoRA modules -# because they need to know the new tokens. For LLaMA and Mistral, you need to save -# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts -# tokens to embeddings, and `lm_head` converts embeddings to token probabilities. -lora_modules_to_save: list[str] | None -lora_dropout: float | None = 0.0 -# The layer indices to transform, otherwise, apply to all layers -peft_layers_to_transform: list[int] | None -peft_layers_pattern: list[str] | None - -peft: PeftConfig | None - # For PeftConfig: - # Configuration options for loftq initialization for LoRA - loftq_config: LoftQConfig | None - # For LoftQConfig: - # typically 4 bits - loftq_bits: int = 4 - -# Whether to use DoRA. -peft_use_dora: bool | None -# Whether to use RSLoRA. -peft_use_rslora: bool | None -# List of layer indices to replicate. -peft_layer_replication: list[tuple[int, int]] | None -# How to initialize LoRA weights. Default to True which is MS original implementation. -peft_init_lora_weights: bool | str | None - -# load qlora model in sharded format for FSDP using answer.ai technique. -qlora_sharded_model_loading: bool | None = False -# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it -# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge -lora_on_cpu: bool | None -# Whether you are training a 4-bit GPTQ quantized model -gptq: bool | None -# optional overrides to the bnb 4bit quantization configuration -bnb_config_kwargs: dict[str, Any] | None - -# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4. -loraplus_lr_ratio: float | None -# loraplus learning rate for lora embedding layers. Default value is 1e-6. -loraplus_lr_embedding: float | None = 1e-06 - -merge_lora: bool | None - -# Number of steps per ReLoRA restart -relora_steps: int | None -# Number of per-restart warmup steps -relora_warmup_steps: int | None -# Number of anneal steps for each relora cycle -relora_anneal_steps: int | None -# threshold for optimizer magnitude when pruning -relora_prune_ratio: float | None -# True to perform lora weight merges on cpu during restarts, for modest gpu memory -# savings -relora_cpu_offload: bool | None - -# If greater than 1, backpropagation will be skipped and the gradients will be -# accumulated for the given number of steps. -gradient_accumulation_steps: int | None = 1 -# The number of samples to include in each batch. This is the number of samples sent to -# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps -micro_batch_size: int | None = 1 -# Total batch size, we do not recommended setting this manually -batch_size: int | None -# per gpu micro batch size for evals, defaults to value of micro_batch_size -eval_batch_size: int | None - -# whether to find batch size that fits in memory. Passed to underlying transformers -# Trainer -auto_find_batch_size: bool | None - -# Whether to mask out or include the human's prompt from the training labels -train_on_inputs: bool | None = False -# Group similarly sized data to minimize padding. May be slower to start, as it must -# download and sort the entire dataset. Note that training loss may have an oscillating -# pattern with this enabled. 
-group_by_length: bool | None - -learning_rate: str | float (required) -embedding_lr: float | None -embedding_lr_scale: float | None -# Specify weight decay -weight_decay: float | None = 0.0 -# Specify optimizer -optimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED -# Dictionary of arguments to pass to the optimizer -optim_args: str | dict[str, Any] | None -# The target modules to optimize, i.e. the module names that you would like to train, -# right now this is used only for GaLore algorithm -optim_target_modules: list[str] | Literal['all_linear'] | None -# Path to torch distx for optim 'adamw_anyprecision' -torchdistx_path: str | None -lr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE -# Specify a scheduler and kwargs to use with the optimizer -lr_scheduler_kwargs: dict[str, Any] | None -lr_quadratic_warmup: bool | None -# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of -# peak lr -cosine_min_lr_ratio: float | None -# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means -# start cosine_min_lr at 80% of training step -cosine_constant_lr_ratio: float | None -# Learning rate div factor -lr_div_factor: float | None - -lr_groups: list[LrGroup] | None - # For LrGroup: - name: str (required) - modules: list[str] (required) - lr: float (required) - -# adamw hyperparams -adam_epsilon: float | None -# only used for CAME Optimizer -adam_epsilon2: float | None -# adamw hyperparams -adam_beta1: float | None -# adamw hyperparams -adam_beta2: float | None -# only used for CAME Optimizer -adam_beta3: float | None -# Gradient clipping max norm -max_grad_norm: float | None -num_epochs: float = 1.0 - -use_wandb: bool | None -# Set the name of your wandb run -wandb_name: str | None -# Set the ID of your wandb run -wandb_run_id: str | None -# "offline" to save run metadata locally and not sync to the server, "disabled" to turn -# off wandb -wandb_mode: str | None -# Your wandb project name -wandb_project: str | None -# A wandb Team name if using a Team -wandb_entity: str | None -wandb_watch: str | None -# "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only -# at the end of training -wandb_log_model: str | None - -use_mlflow: bool | None -# URI to mlflow -mlflow_tracking_uri: str | None -# Your experiment name -mlflow_experiment_name: str | None -# Your run name -mlflow_run_name: str | None -# set to true to copy each saved checkpoint on each save to mlflow artifact registry -hf_mlflow_log_artifacts: bool | None - -# Enable or disable Comet integration. -use_comet: bool | None -# API key for Comet. Recommended to set via `comet login`. -comet_api_key: str | None -# Workspace name in Comet. Defaults to the user's default workspace. -comet_workspace: str | None -# Project name in Comet. Defaults to Uncategorized. -comet_project_name: str | None -# Identifier for the experiment. Used to append data to an existing experiment or -# control the key of new experiments. Default to a random key. -comet_experiment_key: str | None -# Create a new experiment ("create") or log to an existing one ("get"). Default -# ("get_or_create") auto-selects based on configuration. -comet_mode: str | None -# Set to True to log data to Comet server, or False for offline storage. Default is -# True. -comet_online: bool | None -# Dictionary for additional configuration settings, see the doc for more details. 
-comet_experiment_config: dict[str, Any] | None - -# the number of activate layers in LISA -lisa_n_layers: int | None -# how often to switch layers in LISA -lisa_step_interval: int | None -# path under the model to access the layers -lisa_layers_attribute: str | None = model.layers - -gradio_title: str | None -gradio_share: bool | None -gradio_server_name: str | None -gradio_server_port: int | None -gradio_max_new_tokens: int | None -gradio_temperature: float | None - -use_ray: bool = False -ray_run_name: str | None -ray_num_workers: int = 1 -resources_per_worker: dict - -# The size of the image to resize to. It can be an integer (resized into padded-square -# image) or a tuple (width, height).If not provided, we will attempt to load from -# preprocessor.size, otherwise, images won't be resized. -image_size: int | tuple[int, int] | None -# The resampling algorithm to use for image resizing. Default is bilinear. Please refer -# to PIL.Image.Resampling for more details. -image_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None - -# optional overrides to the base model configuration -overrides_of_model_config: dict[str, Any] | None -# optional overrides the base model loading from_pretrained -overrides_of_model_kwargs: dict[str, Any] | None -# If you want to specify the type of model to load, AutoModelForCausalLM is a good -# choice too -type_of_model: str | None -# You can specify to choose a specific model revision from huggingface hub -revision_of_model: str | None - -max_packed_sequence_len: int | None -rope_scaling: Any | None -noisy_embedding_alpha: float | None -dpo_beta: float | None -evaluation_strategy: str | None +dpo_generate_during_eval: bool | None + +# A list of one or more datasets to finetune the model with +datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None + # For SFTDataset: + # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory + path: str | None + # name of dataset split to load from + split: str | None + # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection] + type: str | UserDefinedPrompterType | None + # For UserDefinedPrompterType: + # Custom user instruction prompt + system_prompt: str | None + # Use {system} as key to be replaced + system_format: str | None + field_system: str | None + field_instruction: str | None + field_input: str | None + field_output: str | None + + # Customizable to be single line or multi-line. Use {instruction}/{input} as key to + # be replaced. 'format' can include {input} + format: str | None + # 'no_input_format' cannot include {input} + no_input_format: str | None + # For `completion` datsets only, uses the provided field instead of `text` column + field: str | None + input_transform: str | None + # split dataset into N pieces (use with shards_idx) + shards: int | None + # the index of sharded dataset to use + shards_idx: int | None + # process dataset in N sequential chunks for memory efficiency (exclusive with + # `shards`) + preprocess_shards: int | None + conversation: str | None + + # The name of the chat template to use for training, following values are supported: + # tokenizer_default: Uses the chat template that is available in the + # tokenizer_config.json. If the chat template is not available in the tokenizer, it + # will raise an error. This is the default. 
+ # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates + # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. + # tokenizer_default_fallback_*: where * is the name of the chat template to fallback + # to if the tokenizer does not have a chat template else default to tokenizer. E.g. + # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat + # template. The custom jinja template should be provided in the chat_template_jinja + # field. + chat_template: ChatTemplate | str | None + # Custom jinja chat template. Used only if `chat_template: jinja` or empty. + chat_template_jinja: str | None + # path to source data files + data_files: str | list[str] | None + input_format: str | None + # name of dataset configuration to load + name: str | None + # defines the datatype when path is a file + ds_type: str | None + field: str | None + field_human: str | None + field_model: str | None + # Key containing the messages (default: "messages") + field_messages: str | None + # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON + # schema](https://json-schema.org/learn/getting-started-step-by-step). + field_tools: str | None + + message_field_role: str | None + + message_field_content: str | None + # Mapping of properties from the input dataset to the chat template. (default: + # message_property_mappings={'role':'role', 'content':'content'}) If a property exists + # in the template but not in this mapping, the system will attempt to load it directly + # from the message using the property name as the key. Example: In the mapping below, + # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and + # used as 'content' in the chat template. + message_property_mappings: dict[str, str] | None + # The key in the message turn that indicates via boolean whether tokens of a turn + # should be considered for training. Useful to selectively train on certain turns + # besides the `roles_to_train`. + message_field_training: str | None + # The key in the message turn that contains the training details. Useful to + # selectively train on certain tokens in a turn. The value of the key is a List[Dict] + # containing `begin_offset` (start character index in content), `end_offset` (end + # character index in content), and `train` (boolean whether to train). + message_field_training_detail: str | None + # (for Qwen3 template only) Whether to split the assistant content based on a + # reasoning trace inside delimited tags + split_thinking: bool | None + logprobs_field: str | None + temperature: float | None + # Roles to train on. The tokens from these roles will be considered for the loss. + roles_to_train: list[str] | None + # Which EOS tokens to train on in the conversation. Possible values are: all: train on + # all EOS tokens, turn (default): train on the EOS token at the end of each trainable + # turn, last: train on the last EOS token in the conversation + train_on_eos: Literal['all', 'turn', 'last'] | None + # Roles mapping in the messages. The format is {target_role: [source_roles]}. All + # source roles will be mapped to the target role. The default is: user: ["human", + # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"] + roles: dict[str, list[str]] | None + # Whether to drop the system turn from the dataset. Only works with chat_template. + # This does not drop the default system message from chat_template if it exists. 
If + # you wish to, we recommend using a custom jinja template with the default system + # message removed or adding a system turn with empty content. + drop_system_message: bool | None + # Trust remote code for untrusted source + trust_remote_code: bool | None = False + # The specific revision of the dataset to use when loading from the Hugging Face Hub. + # This can be a commit hash, tag, or branch name. If not specified, the latest version + # will be used. This parameter is ignored for local datasets. + revision: str | None + + # For DPODataset: + path: str | None + split: str | None + type: UserDefinedDPOType | str | None + # For UserDefinedDPOType: + field_system: str | None + field_prompt: str | None + field_chosen: str | None + field_rejected: str | None + prompt_format: str | None + chosen_format: str | None + rejected_format: str | None + data_files: list[str] | None + revision: str | None + field_messages: str | None + + # For KTODataset: + path: str | None + split: str | None + type: UserDefinedKTOType | str | None + # For UserDefinedKTOType: + field_system: str | None + field_prompt: str | None + field_completion: str | None + field_label: bool | None + prompt_format: str | None + completion_format: str | None + data_files: list[str] | None + trust_remote_code: bool | None = False + revision: str | None + + # For StepwiseSupervisedDataset: + path: str | None + split: str | None + data_files: list[str] | None + revision: str | None + step_separator: str | None + max_completion_length: int | None + train_on_last_step_only: bool | None + +# A list of one or more datasets to eval the model with. You can use either +# test_datasets, or val_set_size, but not both. +test_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None + # For SFTDataset: + # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory + path: str | None + # name of dataset split to load from + split: str | None + # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection] + type: str | UserDefinedPrompterType | None + # For UserDefinedPrompterType: + # Custom user instruction prompt + system_prompt: str | None + # Use {system} as key to be replaced + system_format: str | None + field_system: str | None + field_instruction: str | None + field_input: str | None + field_output: str | None + + # Customizable to be single line or multi-line. Use {instruction}/{input} as key to + # be replaced. 'format' can include {input} + format: str | None + # 'no_input_format' cannot include {input} + no_input_format: str | None + # For `completion` datsets only, uses the provided field instead of `text` column + field: str | None + input_transform: str | None + # split dataset into N pieces (use with shards_idx) + shards: int | None + # the index of sharded dataset to use + shards_idx: int | None + # process dataset in N sequential chunks for memory efficiency (exclusive with + # `shards`) + preprocess_shards: int | None + conversation: str | None + + # The name of the chat template to use for training, following values are supported: + # tokenizer_default: Uses the chat template that is available in the + # tokenizer_config.json. If the chat template is not available in the tokenizer, it + # will raise an error. This is the default. + # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates + # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. 
+ # tokenizer_default_fallback_*: where * is the name of the chat template to fallback + # to if the tokenizer does not have a chat template else default to tokenizer. E.g. + # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat + # template. The custom jinja template should be provided in the chat_template_jinja + # field. + chat_template: ChatTemplate | str | None + # Custom jinja chat template. Used only if `chat_template: jinja` or empty. + chat_template_jinja: str | None + # path to source data files + data_files: str | list[str] | None + input_format: str | None + # name of dataset configuration to load + name: str | None + # defines the datatype when path is a file + ds_type: str | None + field: str | None + field_human: str | None + field_model: str | None + # Key containing the messages (default: "messages") + field_messages: str | None + # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON + # schema](https://json-schema.org/learn/getting-started-step-by-step). + field_tools: str | None + + message_field_role: str | None + + message_field_content: str | None + # Mapping of properties from the input dataset to the chat template. (default: + # message_property_mappings={'role':'role', 'content':'content'}) If a property exists + # in the template but not in this mapping, the system will attempt to load it directly + # from the message using the property name as the key. Example: In the mapping below, + # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and + # used as 'content' in the chat template. + message_property_mappings: dict[str, str] | None + # The key in the message turn that indicates via boolean whether tokens of a turn + # should be considered for training. Useful to selectively train on certain turns + # besides the `roles_to_train`. + message_field_training: str | None + # The key in the message turn that contains the training details. Useful to + # selectively train on certain tokens in a turn. The value of the key is a List[Dict] + # containing `begin_offset` (start character index in content), `end_offset` (end + # character index in content), and `train` (boolean whether to train). + message_field_training_detail: str | None + # (for Qwen3 template only) Whether to split the assistant content based on a + # reasoning trace inside delimited tags + split_thinking: bool | None + logprobs_field: str | None + temperature: float | None + # Roles to train on. The tokens from these roles will be considered for the loss. + roles_to_train: list[str] | None + # Which EOS tokens to train on in the conversation. Possible values are: all: train on + # all EOS tokens, turn (default): train on the EOS token at the end of each trainable + # turn, last: train on the last EOS token in the conversation + train_on_eos: Literal['all', 'turn', 'last'] | None + # Roles mapping in the messages. The format is {target_role: [source_roles]}. All + # source roles will be mapped to the target role. The default is: user: ["human", + # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"] + roles: dict[str, list[str]] | None + # Whether to drop the system turn from the dataset. Only works with chat_template. + # This does not drop the default system message from chat_template if it exists. If + # you wish to, we recommend using a custom jinja template with the default system + # message removed or adding a system turn with empty content. 
+ drop_system_message: bool | None + # Trust remote code for untrusted source + trust_remote_code: bool | None = False + # The specific revision of the dataset to use when loading from the Hugging Face Hub. + # This can be a commit hash, tag, or branch name. If not specified, the latest version + # will be used. This parameter is ignored for local datasets. + revision: str | None + + # For DPODataset: + path: str | None + split: str | None + type: UserDefinedDPOType | str | None + # For UserDefinedDPOType: + field_system: str | None + field_prompt: str | None + field_chosen: str | None + field_rejected: str | None + prompt_format: str | None + chosen_format: str | None + rejected_format: str | None + data_files: list[str] | None + revision: str | None + field_messages: str | None + + # For KTODataset: + path: str | None + split: str | None + type: UserDefinedKTOType | str | None + # For UserDefinedKTOType: + field_system: str | None + field_prompt: str | None + field_completion: str | None + field_label: bool | None + prompt_format: str | None + completion_format: str | None + data_files: list[str] | None + trust_remote_code: bool | None = False + revision: str | None + + # For StepwiseSupervisedDataset: + path: str | None + split: str | None + data_files: list[str] | None + revision: str | None + step_separator: str | None + max_completion_length: int | None + train_on_last_step_only: bool | None + +# If false, the datasets will not be shuffled and will keep their original order in +# `datasets`. The same applies to the `test_datasets` option and the +# `pretraining_dataset` option. Default is true. +shuffle_merged_datasets: bool | None = True +# Axolotl attempts to save the dataset as an arrow after packing the data together so +# subsequent training attempts load faster, relative path +dataset_prepared_path: str | None +# Num shards for whole dataset +dataset_shard_num: int | None +# Index of shard to use for whole dataset +dataset_shard_idx: int | None +skip_prepare_dataset: bool | None = False + +# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize +pretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None + # For PretrainingDataset: + name: str | None + path: str | None + split: str | None = train + text_column: str | None = text + type: str | None = pretrain + trust_remote_code: bool | None = False + data_files: str | None + skip: int | None + + # For SFTDataset: + # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory + path: str | None + # name of dataset split to load from + split: str | None + # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection] + type: str | UserDefinedPrompterType | None + # For UserDefinedPrompterType: + # Custom user instruction prompt + system_prompt: str | None + # Use {system} as key to be replaced + system_format: str | None + field_system: str | None + field_instruction: str | None + field_input: str | None + field_output: str | None + + # Customizable to be single line or multi-line. Use {instruction}/{input} as key to + # be replaced. 
'format' can include {input} + format: str | None + # 'no_input_format' cannot include {input} + no_input_format: str | None + # For `completion` datsets only, uses the provided field instead of `text` column + field: str | None + input_transform: str | None + # split dataset into N pieces (use with shards_idx) + shards: int | None + # the index of sharded dataset to use + shards_idx: int | None + # process dataset in N sequential chunks for memory efficiency (exclusive with + # `shards`) + preprocess_shards: int | None + conversation: str | None + + # The name of the chat template to use for training, following values are supported: + # tokenizer_default: Uses the chat template that is available in the + # tokenizer_config.json. If the chat template is not available in the tokenizer, it + # will raise an error. This is the default. + # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates + # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. + # tokenizer_default_fallback_*: where * is the name of the chat template to fallback + # to if the tokenizer does not have a chat template else default to tokenizer. E.g. + # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat + # template. The custom jinja template should be provided in the chat_template_jinja + # field. + chat_template: ChatTemplate | str | None + # Custom jinja chat template. Used only if `chat_template: jinja` or empty. + chat_template_jinja: str | None + # path to source data files + data_files: str | list[str] | None + input_format: str | None + # name of dataset configuration to load + name: str | None + # defines the datatype when path is a file + ds_type: str | None + field: str | None + field_human: str | None + field_model: str | None + # Key containing the messages (default: "messages") + field_messages: str | None + # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON + # schema](https://json-schema.org/learn/getting-started-step-by-step). + field_tools: str | None + + message_field_role: str | None + + message_field_content: str | None + # Mapping of properties from the input dataset to the chat template. (default: + # message_property_mappings={'role':'role', 'content':'content'}) If a property exists + # in the template but not in this mapping, the system will attempt to load it directly + # from the message using the property name as the key. Example: In the mapping below, + # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and + # used as 'content' in the chat template. + message_property_mappings: dict[str, str] | None + # The key in the message turn that indicates via boolean whether tokens of a turn + # should be considered for training. Useful to selectively train on certain turns + # besides the `roles_to_train`. + message_field_training: str | None + # The key in the message turn that contains the training details. Useful to + # selectively train on certain tokens in a turn. The value of the key is a List[Dict] + # containing `begin_offset` (start character index in content), `end_offset` (end + # character index in content), and `train` (boolean whether to train). + message_field_training_detail: str | None + # (for Qwen3 template only) Whether to split the assistant content based on a + # reasoning trace inside delimited tags + split_thinking: bool | None + logprobs_field: str | None + temperature: float | None + # Roles to train on. 
The tokens from these roles will be considered for the loss. + roles_to_train: list[str] | None + # Which EOS tokens to train on in the conversation. Possible values are: all: train on + # all EOS tokens, turn (default): train on the EOS token at the end of each trainable + # turn, last: train on the last EOS token in the conversation + train_on_eos: Literal['all', 'turn', 'last'] | None + # Roles mapping in the messages. The format is {target_role: [source_roles]}. All + # source roles will be mapped to the target role. The default is: user: ["human", + # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"] + roles: dict[str, list[str]] | None + # Whether to drop the system turn from the dataset. Only works with chat_template. + # This does not drop the default system message from chat_template if it exists. If + # you wish to, we recommend using a custom jinja template with the default system + # message removed or adding a system turn with empty content. + drop_system_message: bool | None + # Trust remote code for untrusted source + trust_remote_code: bool | None = False + # The specific revision of the dataset to use when loading from the Hugging Face Hub. + # This can be a commit hash, tag, or branch name. If not specified, the latest version + # will be used. This parameter is ignored for local datasets. + revision: str | None + +# The maximum number of processes to use while preprocessing your input dataset. This +# defaults to `os.cpu_count()` if not set. +dataset_processes: int | None = 4 +# Deduplicates datasets and test_datasets with identical entries +dataset_exact_deduplication: bool | None +# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking +# too much storage +dataset_keep_in_memory: bool | None +dataloader_pin_memory: bool | None +dataloader_num_workers: int | None +dataloader_prefetch_factor: int | None +dataloader_drop_last: bool | None + +accelerator_config: dict[str, Any] | None + +remove_unused_columns: bool | None + +# Push prepared dataset to hub - repo_org/repo_name +push_dataset_to_hub: str | None +# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private +# datasets. Required to be true when used in combination with `push_dataset_to_hub` +hf_use_auth_token: bool | None + +device: Any | None +# Passed through to transformers when loading the model when launched without +# accelerate. Use `sequential` when training w/ model parallelism to limit memory +device_map: Any | None +world_size: int | None +# Don't mess with this, it's here for accelerate and torchrun +local_rank: int | None +ddp: bool | None + +# Seed for reproducibility +seed: int | None +# Advanced DDP Arguments - timeout +ddp_timeout: int | None +# Advanced DDP Arguments - bucket cap in MB +ddp_bucket_cap_mb: int | None +# Advanced DDP Arguments - broadcast buffers +ddp_broadcast_buffers: bool | None +ddp_find_unused_parameters: bool | None + +# Approximate number of predictions sent to wandb depending on batch size. Enabled above +# 0. Default is 0 +eval_table_size: int | None +# Total number of tokens generated for predictions sent to wandb. Default is 128 +eval_max_new_tokens: int | None +# Whether to run causal language model evaluation for metrics in +# `eval_causal_lm_metrics` +do_causal_lm_eval: bool | None +# HF evaluate metrics used during evaluation. 
Default is ['sacrebleu', 'comet', 'ter',
+# 'chrf', 'perplexity']
+eval_causal_lm_metrics: list[str] | None
+do_bench_eval: bool | None
+bench_dataset: str | None
+bench_split: str | None
+metric_for_best_model: str | None
+greater_is_better: bool | None
+
+# High loss value, indicating the learning has broken down (a good estimate is ~2 times
+# the loss at the start of training)
+loss_watchdog_threshold: float | None
+# Number of high-loss steps in a row before the trainer aborts (default: 3)
+loss_watchdog_patience: int | None
+
+gc_steps: int | None
+
+# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.
+# require >=ampere
+bf16: Literal['auto'] | bool | None = auto
+# Use CUDA fp16
+fp16: bool | None
+fp8: bool | None
+# No AMP (automatic mixed precision) - require >=ampere
+bfloat16: bool | None
+# No AMP (automatic mixed precision)
+float16: bool | None
+# Use CUDA tf32 - require >=ampere
+tf32: bool | None
+float32: bool | None
+
+# Whether to use gradient checkpointing. Available options are: true, false, 'offload',
+# 'offload_disk'.
+# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
+gradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False
+# Additional kwargs to pass to the trainer for gradient checkpointing
+gradient_checkpointing_kwargs: dict[str, Any] | None
+
+unfrozen_parameters: list[str] | None
+
+# The maximum length of an input to train with; this should typically be less than 2048
+# as most models have a token/context limit of 2048
+sequence_len: int = 512
+# The maximum length of an input for evaluation. If not specified, defaults to
+# sequence_len
+eval_sequence_len: int | None
+min_sample_len: int | None
+# maximum prompt length for RL training
+max_prompt_len: int = 512
+# Use efficient multi-packing with block diagonal attention and per sequence
+# position_ids. Recommend setting to 'true'
+sample_packing: bool | None
+# The number of samples packed at a time. Increasing the following values helps with
+# packing, but usually only slightly (<1%).
+sample_packing_group_size: int | None = 100000
+# The number of samples which can be packed into one sequence. Increase if using a large
+# sequence_len with many short samples.
+sample_packing_bin_size: int | None = 200
+# Whether to pack samples sequentially
+sample_packing_sequentially: bool | None
+# The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or
+# 'forkserver'
+sample_packing_mp_start_method: str | None
+# Set to 'false' if getting errors during eval with sample_packing on
+eval_sample_packing: bool | None
+# Pad inputs so each step uses constant sized buffers.
This will reduce memory +# fragmentation and may prevent OOMs, by re-using memory more efficiently +pad_to_sequence_len: bool | None +# Whether to use sequential sampling for curriculum learning +curriculum_sampling: bool | None +multipack_real_batches: bool | None +# whether to concatenate samples during pretraining +pretraining_sample_concatenation: bool | None + +# Use batch flattening for speedups when not using sample_packing +batch_flattening: Literal['auto'] | bool | None + +use_pose: bool | None +pose_split_on_token_ids: list[int] | None +pose_max_context_len: int | None +pose_num_chunks: int | None + +pretrain_multipack_buffer_size: int | None = 10000 +# whether to prevent cross attention for packed sequences during pretraining +pretrain_multipack_attn: bool | None = True + +# Whether to use xformers attention patch https://github.com/facebookresearch/xformers +xformers_attention: bool | None +# Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/ +# torch.nn.functional.scaled_dot_product_attention.html +sdp_attention: bool | None +# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf +s2_attention: bool | None +flex_attention: bool | None +flex_attn_compile_kwargs: dict[str, Any] | None +# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention +flash_attention: bool | None +# Whether to use flash-attention cross entropy implementation - advanced use only +flash_attn_cross_entropy: bool | None +# Whether to use flash-attention rms norm implementation - advanced use only +flash_attn_rms_norm: bool | None +# Whether to fuse QKV into a single operation +flash_attn_fuse_qkv: bool | None +# Whether to fuse part of the MLP into a single operation +flash_attn_fuse_mlp: bool | None +# Whether to use bettertransformers +flash_optimum: bool | None + +eager_attention: bool | None + +unsloth_cross_entropy_loss: bool | None +unsloth_lora_mlp: bool | None +unsloth_lora_qkv: bool | None +unsloth_lora_o: bool | None +unsloth_rms_norm: bool | None +unsloth_rope: bool | None + +# Apply custom LoRA autograd functions and activation function Triton kernels for speed +# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html +lora_mlp_kernel: bool | None +# Apply custom LoRA autograd functions and activation function Triton kernels for speed +# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html +lora_qkv_kernel: bool | None +# Apply custom LoRA autograd functions and activation function Triton kernels for speed +# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html +lora_o_kernel: bool | None + +# Whether to use chunked cross entropy loss for memory efficiency +chunked_cross_entropy: bool | None +# Number of chunks to use for chunked cross entropy loss +chunked_cross_entropy_num_chunks: int | None + +llama4_linearized_experts: bool | None + +# Deepspeed config path. e.g., deepspeed_configs/zero3.json +deepspeed: str | dict[str, Any] | None +# FSDP configuration +fsdp: list[str] | None +# FSDP configuration options +fsdp_config: dict[str, Any] | None +fsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None + +# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for +# no eval. +val_set_size: float | None = 0.0 + +# Set to a divisor of the number of GPUs available to split sequences into chunks of +# equal size. 
Use in long context training to prevent OOM when sequences cannot fit into +# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each +# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized +# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more +# details. +sequence_parallel_degree: int | None +# Optional; strides across the key dimension. Larger values use more memory but should +# make training faster. Must evenly divide the number of KV heads in your model. +heads_k_stride: int | None +# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to +# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing +# case. +ring_attn_func: RingAttnFunc | None + +# Add or change special tokens. If you add tokens here, you don't need to add them to +# the `tokens` list. +special_tokens: SpecialTokensConfig | None + # For SpecialTokensConfig: + bos_token: str | None + eos_token: str | None + pad_token: str | None + unk_token: str | None + additional_special_tokens: list[str] | None + +# Add extra tokens to the tokenizer +tokens: list[str] | None +# Mapping token_id to new_token_string to override reserved added_tokens in the +# tokenizer. Only works for tokens that are not part of the base vocab (aka are +# added_tokens). Can be checked if they exist in tokenizer.json added_tokens. +added_tokens_overrides: dict[int, str] | None + +# Whether to use torch.compile and which backend to use. setting to `auto` will enable +# torch compile when torch>=2.5.1 +torch_compile: Literal['auto'] | bool | None +# Backend to use for torch.compile +torch_compile_backend: str | None +torch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None + +# Maximum number of iterations to train for. It precedes num_epochs which means that if +# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps => +# `num_epochs: 2` and `max_steps: 100` will train for 100 steps +max_steps: int | None +# Number of warmup steps. Cannot use with warmup_ratio +warmup_steps: int | None +# Warmup ratio. Cannot use with warmup_steps +warmup_ratio: float | None +# Leave empty to eval at each epoch, integer for every N steps. float for fraction of +# total steps +eval_steps: int | float | None +# Number of times per epoch to run evals, mutually exclusive with eval_steps +evals_per_epoch: int | None +# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer +# from `eval_steps` +eval_strategy: str | None +# Leave empty to save at each epoch, integer for every N steps. float for fraction of +# total steps +save_steps: int | float | None +# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps +saves_per_epoch: int | None +# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better +# result is achieved, leave empty to infer from `save_steps` +save_strategy: str | None +# Checkpoints saved at a time +save_total_limit: int | None +# Logging frequency +logging_steps: int | None +# Stop training after this many evaluation losses have increased in a row. https://huggi +# ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin +# gCallback +early_stopping_patience: int | None +load_best_model_at_end: bool | None = False +# Save only the model weights, skipping the optimizer. Using this means you can't resume +# from checkpoints. 
+save_only_model: bool | None = False
+# Use tensorboard for logging
+use_tensorboard: bool | None
+# Enable the pytorch profiler to capture the first N steps of training to the
+# output_dir. See https://pytorch.org/blog/understanding-gpu-memory-1/ for more
+# information. Snapshots can be visualized @ https://pytorch.org/memory_viz
+profiler_steps: int | None
+# Whether to include tokens per second in the training metrics. This
+# iterates over the entire dataset once, so it takes some time.
+include_tokens_per_second: bool | None
+
+# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to
+# add noise to embeddings. Currently only supported on Llama and Mistral
+neftune_noise_alpha: float | None
+
+# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to
+# `beta` in `ORPOConfig` due to trl mapping.
+orpo_alpha: float | None
+# Weighting of NLL term in loss from RPO paper
+rpo_alpha: float | None
+# Target reward margin for the SimPO loss
+simpo_gamma: float | None
+# Weight of the BC regularizer
+cpo_alpha: float | None
+
+# Factor for desirable loss term in KTO loss
+kto_desirable_weight: float | None
+# Factor for undesirable loss term in KTO loss
+kto_undesirable_weight: float | None
+# The beta parameter for the RL training
+rl_beta: float | None
+
+# Defines the max memory usage per gpu on the system. Passed through to transformers
+# when loading the model.
+max_memory: dict[int | Literal['cpu', 'disk'], int | str] | None
+# Limit the memory for all available GPUs to this amount (if an integer, expressed in
+# gigabytes); default: unset
+gpu_memory_limit: int | str | None
+# Whether to use low_cpu_mem_usage
+low_cpu_mem_usage: bool | None
+
+# The name of the chat template to use for training, following values are supported:
+# tokenizer_default: Uses the chat template that is available in the
+# tokenizer_config.json. If the chat template is not available in the tokenizer, it will
+# raise an error. This is the default value.
+# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates
+# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.
+# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.
+# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not
+# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.
+# The custom jinja template should be provided in the chat_template_jinja field. The
+# selected chat template will be saved to the tokenizer_config.json for easier
+# inferencing
+chat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None
+# Custom jinja template for chat template. This will only be used if chat_template is
+# set to `jinja` or `null` (in which case chat_template is automatically set to
+# `jinja`). Default is null.
+chat_template_jinja: str | None
+# Additional kwargs to pass to the chat template. This is useful for customizing the
+# chat template. For example, you can pass `thinking=False` to add a generation prompt
+# to the chat template.
+chat_template_kwargs: dict[str, Any] | None
+# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the
+# boundaries between conversation turns. For example: ['/INST', '</s>',
+# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token.
This is
+# useful for templates that use multiple delimiter tokens.
+eot_tokens: list[str] | None
+# Changes the default system message. Currently only supports chatml.
+default_system_message: str | None
+
+fix_untrained_tokens: int | list[int] | None
+
+is_preprocess: bool | None
+preprocess_iterable: bool | None
+
+# Total number of tokens - internal use
+total_num_tokens: int | None
+total_supervised_tokens: int | None
+# You can set these packing optimizations AFTER starting a training at least once. The
+# trainer will provide recommended values for these settings.
+sample_packing_eff_est: float | None
+axolotl_config_path: str | None
+
+# Internal use only - Used to identify which model family the model is based on
+is_falcon_derived_model: bool | None
+# Internal use only - Used to identify which model family the model is based on
+is_llama_derived_model: bool | None
+# Internal use only - Used to identify which model family the model is based on. Please
+# note that if you set this to true, `padding_side` will be set to 'left' by default
+is_mistral_derived_model: bool | None
+# Internal use only - Used to identify which model family the model is based on
+is_qwen_derived_model: bool | None
+
+# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available
+# plugins or doc below for more details.
+# https://docs.axolotl.ai/docs/custom_integrations.html
+plugins: list[str] | None
+
+# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This
+# can also be a relative path to a model on disk
+base_model: str (required)
+# If the base_model repo on hf hub doesn't include configuration .json files, you can
+# set that here, or leave this empty to default to base_model
+base_model_config: str | None
+cls_model_config: str | None
+# Optional tokenizer configuration path in case you want to use a different tokenizer
+# than the one defined in the base model
+tokenizer_config: str | None
+# use_fast option for tokenizer loading from_pretrained, defaults to True
+tokenizer_use_fast: bool | None
+# Whether to use the legacy tokenizer setting, defaults to True
+tokenizer_legacy: bool | None
+# Whether to use the mistral-common tokenizer. If set to True, it will use the
+# mistral-common tokenizer.
+tokenizer_use_mistral_common: bool | None
+# Corresponding tokenizer for the model. AutoTokenizer is a good choice
+tokenizer_type: str | None
+# transformers processor class
+processor_type: str | None
+# Trust remote code for untrusted source
+trust_remote_code: bool | None
+
+# Where to save the full-finetuned model to
+output_dir: str = ./model-out
+# push checkpoints to hub
+hub_model_id: str | None
+# how to push checkpoints to hub
+hub_strategy: str | None
+# Save model as safetensors (requires safetensors package). Default True
+save_safetensors: bool | None = True
+
+# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
+load_in_8bit: bool | None = False
+# Use bitsandbytes 4 bit
+load_in_4bit: bool | None = False
+
+# Set to 'lora' or 'qlora', or leave blank to train all parameters in the
+# original model
+adapter: str | None
+# If you already have a lora model trained that you want to load, put that here. This
+# means after training, if you want to test the model, you should set this to the value
+# of `output_dir`. Note that if you merge an adapter to the base model, a new
+# subdirectory `merged` will be created under the `output_dir`.
+lora_model_dir: str | None
+lora_r: int | None
+lora_alpha: int | None
+lora_fan_in_fan_out: bool | None
+lora_target_modules: str | list[str] | None
+# If true, will target all linear modules
+lora_target_linear: bool | None
+# If you added new tokens to the tokenizer, you may need to save some LoRA modules
+# because they need to know the new tokens. For LLaMA and Mistral, you need to save
+# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts
+# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
+lora_modules_to_save: list[str] | None
+lora_dropout: float | None = 0.0
+# The layer indices to transform; otherwise, apply to all layers
+peft_layers_to_transform: list[int] | None
+peft_layers_pattern: list[str] | None
+
+peft: PeftConfig | None
+ # For PeftConfig:
+ # Configuration options for loftq initialization for LoRA
+ loftq_config: LoftQConfig | None
+ # For LoftQConfig:
+ # typically 4 bits
+ loftq_bits: int = 4
+
+# Whether to use DoRA.
+peft_use_dora: bool | None
+# Whether to use RSLoRA.
+peft_use_rslora: bool | None
+# List of layer indices to replicate.
+peft_layer_replication: list[tuple[int, int]] | None
+# How to initialize LoRA weights. Defaults to True, which is the MS original
+# implementation.
+peft_init_lora_weights: bool | str | None
+
+# load qlora model in sharded format for FSDP using answer.ai technique.
+qlora_sharded_model_loading: bool | None = False
+# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it
+# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
+lora_on_cpu: bool | None
+# Whether you are training a 4-bit GPTQ quantized model
+gptq: bool | None
+# optional overrides to the bnb 4bit quantization configuration
+bnb_config_kwargs: dict[str, Any] | None
+
+# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.
+loraplus_lr_ratio: float | None
+# loraplus learning rate for lora embedding layers. Default value is 1e-6.
+loraplus_lr_embedding: float | None = 1e-06
+
+merge_lora: bool | None
+
+# Number of steps per ReLoRA restart
+relora_steps: int | None
+# Number of per-restart warmup steps
+relora_warmup_steps: int | None
+# Number of anneal steps for each relora cycle
+relora_anneal_steps: int | None
+# threshold for optimizer magnitude when pruning
+relora_prune_ratio: float | None
+# True to perform lora weight merges on cpu during restarts, for modest gpu memory
+# savings
+relora_cpu_offload: bool | None
+
+# If greater than 1, the optimizer step will be skipped and the gradients will be
+# accumulated for the given number of steps.
+gradient_accumulation_steps: int | None = 1
+# The number of samples to include in each batch. This is the number of samples sent to
+# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps
+micro_batch_size: int | None = 1
+# Total batch size; we do not recommend setting this manually
+batch_size: int | None
+# per gpu micro batch size for evals, defaults to value of micro_batch_size
+eval_batch_size: int | None
+
+# whether to find batch size that fits in memory. Passed to underlying transformers
+# Trainer
+auto_find_batch_size: bool | None
+
+# Whether to mask out or include the human's prompt from the training labels
+train_on_inputs: bool | None = False
+# Group similarly sized data to minimize padding. May be slower to start, as it must
+# download and sort the entire dataset. Note that training loss may have an oscillating
+# pattern with this enabled.
+group_by_length: bool | None + +learning_rate: str | float (required) +embedding_lr: float | None +embedding_lr_scale: float | None +# Specify weight decay +weight_decay: float | None = 0.0 +# Specify optimizer +optimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED +# Dictionary of arguments to pass to the optimizer +optim_args: str | dict[str, Any] | None +# The target modules to optimize, i.e. the module names that you would like to train, +# right now this is used only for GaLore algorithm +optim_target_modules: list[str] | Literal['all_linear'] | None +# Path to torch distx for optim 'adamw_anyprecision' +torchdistx_path: str | None +lr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE +# Specify a scheduler and kwargs to use with the optimizer +lr_scheduler_kwargs: dict[str, Any] | None +lr_quadratic_warmup: bool | None +# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of +# peak lr +cosine_min_lr_ratio: float | None +# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means +# start cosine_min_lr at 80% of training step +cosine_constant_lr_ratio: float | None +# Learning rate div factor +lr_div_factor: float | None + +lr_groups: list[LrGroup] | None + # For LrGroup: + name: str (required) + modules: list[str] (required) + lr: float (required) + +# adamw hyperparams +adam_epsilon: float | None +# only used for CAME Optimizer +adam_epsilon2: float | None +# adamw hyperparams +adam_beta1: float | None +# adamw hyperparams +adam_beta2: float | None +# only used for CAME Optimizer +adam_beta3: float | None +# Gradient clipping max norm +max_grad_norm: float | None +num_epochs: float = 1.0 + +use_wandb: bool | None +# Set the name of your wandb run +wandb_name: str | None +# Set the ID of your wandb run +wandb_run_id: str | None +# "offline" to save run metadata locally and not sync to the server, "disabled" to turn +# off wandb +wandb_mode: str | None +# Your wandb project name +wandb_project: str | None +# A wandb Team name if using a Team +wandb_entity: str | None +wandb_watch: str | None +# "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only +# at the end of training +wandb_log_model: str | None + +use_mlflow: bool | None +# URI to mlflow +mlflow_tracking_uri: str | None +# Your experiment name +mlflow_experiment_name: str | None +# Your run name +mlflow_run_name: str | None +# set to true to copy each saved checkpoint on each save to mlflow artifact registry +hf_mlflow_log_artifacts: bool | None + +# Enable or disable Comet integration. +use_comet: bool | None +# API key for Comet. Recommended to set via `comet login`. +comet_api_key: str | None +# Workspace name in Comet. Defaults to the user's default workspace. +comet_workspace: str | None +# Project name in Comet. Defaults to Uncategorized. +comet_project_name: str | None +# Identifier for the experiment. Used to append data to an existing experiment or +# control the key of new experiments. Default to a random key. +comet_experiment_key: str | None +# Create a new experiment ("create") or log to an existing one ("get"). Default +# ("get_or_create") auto-selects based on configuration. +comet_mode: str | None +# Set to True to log data to Comet server, or False for offline storage. Default is +# True. +comet_online: bool | None +# Dictionary for additional configuration settings, see the doc for more details. 
+comet_experiment_config: dict[str, Any] | None
+
+# the number of active layers in LISA
+lisa_n_layers: int | None
+# how often to switch layers in LISA
+lisa_step_interval: int | None
+# path under the model to access the layers
+lisa_layers_attribute: str | None = model.layers
+
+gradio_title: str | None
+gradio_share: bool | None
+gradio_server_name: str | None
+gradio_server_port: int | None
+gradio_max_new_tokens: int | None
+gradio_temperature: float | None
+
+use_ray: bool = False
+ray_run_name: str | None
+ray_num_workers: int = 1
+resources_per_worker: dict
+
+# The size of the image to resize to. It can be an integer (resized into padded-square
+# image) or a tuple (width, height). If not provided, we will attempt to load from
+# preprocessor.size, otherwise, images won't be resized.
+image_size: int | tuple[int, int] | None
+# The resampling algorithm to use for image resizing. Default is bilinear. Please refer
+# to PIL.Image.Resampling for more details.
+image_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None
+
+# optional overrides to the base model configuration
+overrides_of_model_config: dict[str, Any] | None
+# optional overrides to the base model loading from_pretrained
+overrides_of_model_kwargs: dict[str, Any] | None
+# If you want to specify the type of model to load, AutoModelForCausalLM is a good
+# choice too
+type_of_model: str | None
+# You can specify a specific model revision to load from the huggingface hub
+revision_of_model: str | None
+
+max_packed_sequence_len: int | None
+rope_scaling: Any | None
+noisy_embedding_alpha: float | None
+dpo_beta: float | None
+evaluation_strategy: str | None
diff --git a/docs/docker.html b/docs/docker.html
index e5c877f9c..261411629 100644
--- a/docs/docker.html
+++ b/docs/docker.html
@@ -525,7 +525,7 @@ Important
-

For Blackwell GPUs, please use the tags with Pytorch 2.7.1 and CUDA 12.8.

+

For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.

@@ -543,6 +543,7 @@ Important
@@ -584,13 +585,15 @@ Tip

Tags examples:

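As a concrete illustration of the tag scheme referenced above (using the axolotlai/axolotl image and the cu128 / PyTorch 2.7.1 tag that appears in the tag examples for this change; substitute whichever Python/CUDA/PyTorch combination you need), a Blackwell-ready image could be pulled with a command along these lines:

docker pull axolotlai/axolotl:main-py3.11-cu128-2.7.1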
diff --git a/search.json b/search.json index 9b6060eb8..66d475c93 100644 --- a/search.json +++ b/search.json @@ -169,7 +169,7 @@ "href": "docs/docker.html#base", "title": "Docker", "section": "Base", - "text": "Base\nThe base image is the most minimal image that can install Axolotl. It is based on the nvidia/cuda image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.\n\nImage\naxolotlai/axolotl-base\nLink: Docker Hub\n\n\nTags format\nmain-base-py{python_version}-cu{cuda_version}-{pytorch_version}\nTags examples:\n\nmain-base-py3.11-cu128-2.7.1\nmain-base-py3.11-cu126-2.7.1\nmain-base-py3.11-cu124-2.6.0\nmain-base-py3.11-cu124-2.5.1", + "text": "Base\nThe base image is the most minimal image that can install Axolotl. It is based on the nvidia/cuda image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.\n\nImage\naxolotlai/axolotl-base\nLink: Docker Hub\n\n\nTags format\nmain-base-py{python_version}-cu{cuda_version}-{pytorch_version}\nTags examples:\n\nmain-base-py3.11-cu128-2.7.1\nmain-base-py3.11-cu126-2.7.1\nmain-base-py3.11-cu126-2.6.0\nmain-base-py3.11-cu124-2.6.0\nmain-base-py3.11-cu124-2.5.1", "crumbs": [ "Deployments", "Docker" @@ -180,7 +180,7 @@ "href": "docs/docker.html#main", "title": "Docker", "section": "Main", - "text": "Main\nThe main image is the image that is used to run Axolotl. It is based on the axolotlai/axolotl-base image and includes the Axolotl codebase, dependencies, and more.\n\nImage\naxolotlai/axolotl\nLink: Docker Hub\n\n\nTags format\n# on push to main\nmain-py{python_version}-cu{cuda_version}-{pytorch_version}\n\n# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)\nmain-latest\n\n# nightly build\n{branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version}\n\n# tagged release\n{version}\n\n\n\n\n\n\nTip\n\n\n\nThere may be some extra tags appended to the image, like -vllm which installs those packages.\n\n\nTags examples:\n\nmain-py3.11-cu126-2.7.0\nmain-py3.11-cu124-2.6.0\nmain-py3.11-cu124-2.5.1\nmain-latest\nmain-20250303-py3.11-cu124-2.6.0\nmain-20250303-py3.11-cu124-2.5.1\n0.9.2", + "text": "Main\nThe main image is the image that is used to run Axolotl. 
It is based on the axolotlai/axolotl-base image and includes the Axolotl codebase, dependencies, and more.\n\nImage\naxolotlai/axolotl\nLink: Docker Hub\n\n\nTags format\n# on push to main\nmain-py{python_version}-cu{cuda_version}-{pytorch_version}\n\n# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)\nmain-latest\n\n# nightly build\n{branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version}\n\n# tagged release\n{version}\n\n\n\n\n\n\nTip\n\n\n\nThere may be some extra tags appended to the image, like -vllm which installs those packages.\n\n\nTags examples:\n\nmain-py3.11-cu128-2.7.1\nmain-py3.11-cu126-2.7.1\nmain-py3.11-cu126-2.6.0\nmain-py3.11-cu124-2.6.0\nmain-py3.11-cu124-2.5.1\nmain-latest\nmain-20250303-py3.11-cu124-2.6.0\nmain-20250303-py3.11-cu124-2.5.1\n0.10.1", "crumbs": [ "Deployments", "Docker" @@ -367,7 +367,7 @@ "href": "docs/config-reference.html", "title": "Config Reference", "section": "", - "text": "# Allow overwrite yml config using from cli\nstrict: bool | None = False\n# Resume from a specific checkpoint dir\nresume_from_checkpoint: str | None\n# If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: bool | None\n# Resize the model embeddings when new tokens are added to multiples of 32. This is\n# reported to improve training speed on some models\nresize_token_embeddings_to_32x: bool | None\nmean_resizing_embeddings: bool | None = False\n\n# Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.\nshrink_embeddings: bool | None\n# Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs\nembeddings_skip_upcast: bool | None\n\n# Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'\nrl: RLType | None\n\ntrl: TRLConfig | None\n # For TRLConfig:\n # Beta parameter for the RL training. Same as `rl_beta`. Use\n beta: float | None\n # Maximum length of the completion for RL training.\n max_completion_length: int | None\n\n # Whether to use VLLM for RL training.\n use_vllm: bool = False\n # Host of the vLLM server to connect to.\n vllm_server_host: str | None = 0.0.0.0\n # Port of the vLLM server to connect to.\n vllm_server_port: int | None = 8000\n # Total timeout (in seconds) to wait for the vLLM server to respond.\n vllm_server_timeout: int | None\n # Regex for vLLM guided decoding.\n vllm_guided_decoding_regex: str | None\n\n # List of reward functions to load. 
Paths must be importable from current dir.\n reward_funcs: list[str] | None\n # List of reward weights for the reward functions.\n reward_weights: list[float] | None\n # Number of generations to sample.\n num_generations: int | None\n # Whether to log completions.\n log_completions: bool | None = False\n # Number of completions to print when log_completions is True.\n num_completions_to_print: int | None\n # Whether to sync the reference model.\n sync_ref_model: bool | None = False\n # Mixup alpha for the reference model.\n ref_model_mixup_alpha: float | None = 0.9\n # Sync steps for the reference model.\n ref_model_sync_steps: int | None = 64\n # Whether to scale rewards by their standard deviation.\n scale_rewards: bool = True\n\n # Sampling temperature for the GRPO policy.\n temperature: float | None\n # Top-p sampling probability for the generation policy.\n top_p: float | None\n # Top-k sampling for the generation policy.\n top_k: int | None\n # Minimum probability for the generation policy.\n min_p: float | None\n # Penalty for tokens that appear in prompt and generated text.\n repetition_penalty: float | None\n # Number of iterations per batch (μ) for GRPO.\n num_iterations: int | None\n # Epsilon value for clipping in the GRPO algorithm.\n epsilon: float | None\n # Upper-bound epsilon value for clipping in the GRPO algorithm.\n epsilon_high: float | None\n # Whether to use Liger loss for GRPO.\n use_liger_loss: bool | None\n # Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.\n loss_type: str | None\n # Whether to exclude truncated completions from loss calculation.\n mask_truncated_completions: bool = False\n\nvllm: VllmConfig | None\n # For VllmConfig:\n # Device to use for VLLM\n device: str | None = auto\n # Tensor parallel size for VLLM\n tensor_parallel_size: int | None\n # GPU memory utilization for VLLM\n gpu_memory_utilization: float | None = 0.9\n # Data type for VLLM\n dtype: str | None = auto\n # Maximum length of the model context for VLLM\n max_model_len: int | None\n # Enable prefix caching for VLLM\n enable_prefix_caching: bool | None\n # Host for the vLLM server to start on\n host: str | None = 0.0.0.0\n # Port of the vLLM server to start on\n port: int | None = 8000\n\n # Enable reasoning for VLLM\n enable_reasoning: bool | None\n # Reasoning parser for VLLM\n reasoning_parser: str | None\n\nqat: QATConfig | None\n # For QATConfig:\n # Fake quantization layout to use for activation quantization. Valid options are\n # \"int4\" and \"int8\"\n activation_dtype: TorchIntDType | None\n # Fake quantization layout to use for weight quantization. Valid options are \"int4\"\n # and \"int8\"\n weight_dtype: TorchIntDType = TorchIntDType.int8\n # Quantize embedding\n quantize_embedding: bool | None = False\n # The number of elements in each group for per-group fake quantization\n group_size: int | None = 32\n # The number of steps to apply fake quantization after\n fake_quant_after_n_steps: int | None\n\nquantization: PTQConfig | None\n # For PTQConfig:\n # Fake quantization layout to use for weight quantization. Valid options are uintX for\n # X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8\n weight_dtype: TorchIntDType = TorchIntDType.int8\n # Fake quantization layout to use for activation quantization. 
Valid options are\n # \"int4\" and \"int8\"\n activation_dtype: TorchIntDType | None\n # Whether to quantize the embedding layer.\n quantize_embedding: bool | None\n # The number of elements in each group for per-group fake quantization\n group_size: int | None = 32\n\n# Reward modelling: `True` or `False`\nreward_model: bool | None\n# Process reward modelling: `True` or `False`\nprocess_reward_model: bool | None\nnum_labels: int | None\n\n# Whether to perform weighting in DPO trainer\ndpo_use_weighting: bool | None\ndpo_use_logits_to_keep: bool | None\ndpo_label_smoothing: float | None\ndpo_norm_loss: bool | None\ndpo_padding_free: bool | None\n\n# A list of one or more datasets to finetune the model with\ndatasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None\n # For SFTDataset:\n # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n path: str | None\n # name of dataset split to load from\n split: str | None\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: str | UserDefinedPrompterType | None\n # For UserDefinedPrompterType:\n # Custom user instruction prompt\n system_prompt: str | None\n # Use {system} as key to be replaced\n system_format: str | None\n field_system: str | None\n field_instruction: str | None\n field_input: str | None\n field_output: str | None\n\n # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n # be replaced. 'format' can include {input}\n format: str | None\n # 'no_input_format' cannot include {input}\n no_input_format: str | None\n # For `completion` datsets only, uses the provided field instead of `text` column\n field: str | None\n input_transform: str | None\n # split dataset into N pieces (use with shards_idx)\n shards: int | None\n # the index of sharded dataset to use\n shards_idx: int | None\n # process dataset in N sequential chunks for memory efficiency (exclusive with\n # `shards`)\n preprocess_shards: int | None\n conversation: str | None\n\n # The name of the chat template to use for training, following values are supported:\n # tokenizer_default: Uses the chat template that is available in the\n # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n # will raise an error. This is the default.\n # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n # template. The custom jinja template should be provided in the chat_template_jinja\n # field.\n chat_template: ChatTemplate | str | None\n # Custom jinja chat template. Used only if `chat_template: jinja` or empty.\n chat_template_jinja: str | None\n # path to source data files\n data_files: str | list[str] | None\n input_format: str | None\n # name of dataset configuration to load\n name: str | None\n # defines the datatype when path is a file\n ds_type: str | None\n field: str | None\n field_human: str | None\n field_model: str | None\n # Key containing the messages (default: \"messages\")\n field_messages: str | None\n # Key containing the tools (default: \"tools\"). 
Must be a list[dict] and follow [JSON\n # schema](https://json-schema.org/learn/getting-started-step-by-step).\n field_tools: str | None\n\n message_field_role: str | None\n\n message_field_content: str | None\n # Mapping of properties from the input dataset to the chat template. (default:\n # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n # in the template but not in this mapping, the system will attempt to load it directly\n # from the message using the property name as the key. Example: In the mapping below,\n # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n # used as 'content' in the chat template.\n message_property_mappings: dict[str, str] | None\n # The key in the message turn that indicates via boolean whether tokens of a turn\n # should be considered for training. Useful to selectively train on certain turns\n # besides the `roles_to_train`.\n message_field_training: str | None\n # The key in the message turn that contains the training details. Useful to\n # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n # containing `begin_offset` (start character index in content), `end_offset` (end\n # character index in content), and `train` (boolean whether to train).\n message_field_training_detail: str | None\n # (for Qwen3 template only) Whether to split the assistant content based on a\n # reasoning trace inside delimited tags\n split_thinking: bool | None\n logprobs_field: str | None\n temperature: float | None\n # Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: list[str] | None\n # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n # turn, last: train on the last EOS token in the conversation\n train_on_eos: Literal['all', 'turn', 'last'] | None\n # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n # source roles will be mapped to the target role. The default is: user: [\"human\",\n # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n roles: dict[str, list[str]] | None\n # Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. If\n # you wish to, we recommend using a custom jinja template with the default system\n # message removed or adding a system turn with empty content.\n drop_system_message: bool | None\n # Trust remote code for untrusted source\n trust_remote_code: bool | None = False\n # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n # This can be a commit hash, tag, or branch name. If not specified, the latest version\n # will be used. 
This parameter is ignored for local datasets.\n revision: str | None\n\n # For DPODataset:\n path: str | None\n split: str | None\n type: UserDefinedDPOType | str | None\n # For UserDefinedDPOType:\n field_system: str | None\n field_prompt: str | None\n field_chosen: str | None\n field_rejected: str | None\n prompt_format: str | None\n chosen_format: str | None\n rejected_format: str | None\n data_files: list[str] | None\n revision: str | None\n field_messages: str | None\n\n # For KTODataset:\n path: str | None\n split: str | None\n type: UserDefinedKTOType | str | None\n # For UserDefinedKTOType:\n field_system: str | None\n field_prompt: str | None\n field_completion: str | None\n field_label: bool | None\n prompt_format: str | None\n completion_format: str | None\n data_files: list[str] | None\n trust_remote_code: bool | None = False\n revision: str | None\n\n # For StepwiseSupervisedDataset:\n path: str | None\n split: str | None\n data_files: list[str] | None\n revision: str | None\n step_separator: str | None\n max_completion_length: int | None\n train_on_last_step_only: bool | None\n\n# A list of one or more datasets to eval the model with. You can use either\n# test_datasets, or val_set_size, but not both.\ntest_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None\n # For SFTDataset:\n # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n path: str | None\n # name of dataset split to load from\n split: str | None\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: str | UserDefinedPrompterType | None\n # For UserDefinedPrompterType:\n # Custom user instruction prompt\n system_prompt: str | None\n # Use {system} as key to be replaced\n system_format: str | None\n field_system: str | None\n field_instruction: str | None\n field_input: str | None\n field_output: str | None\n\n # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n # be replaced. 'format' can include {input}\n format: str | None\n # 'no_input_format' cannot include {input}\n no_input_format: str | None\n # For `completion` datsets only, uses the provided field instead of `text` column\n field: str | None\n input_transform: str | None\n # split dataset into N pieces (use with shards_idx)\n shards: int | None\n # the index of sharded dataset to use\n shards_idx: int | None\n # process dataset in N sequential chunks for memory efficiency (exclusive with\n # `shards`)\n preprocess_shards: int | None\n conversation: str | None\n\n # The name of the chat template to use for training, following values are supported:\n # tokenizer_default: Uses the chat template that is available in the\n # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n # will raise an error. This is the default.\n # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n # template. The custom jinja template should be provided in the chat_template_jinja\n # field.\n chat_template: ChatTemplate | str | None\n # Custom jinja chat template. 
Used only if `chat_template: jinja` or empty.\n chat_template_jinja: str | None\n # path to source data files\n data_files: str | list[str] | None\n input_format: str | None\n # name of dataset configuration to load\n name: str | None\n # defines the datatype when path is a file\n ds_type: str | None\n field: str | None\n field_human: str | None\n field_model: str | None\n # Key containing the messages (default: \"messages\")\n field_messages: str | None\n # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n # schema](https://json-schema.org/learn/getting-started-step-by-step).\n field_tools: str | None\n\n message_field_role: str | None\n\n message_field_content: str | None\n # Mapping of properties from the input dataset to the chat template. (default:\n # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n # in the template but not in this mapping, the system will attempt to load it directly\n # from the message using the property name as the key. Example: In the mapping below,\n # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n # used as 'content' in the chat template.\n message_property_mappings: dict[str, str] | None\n # The key in the message turn that indicates via boolean whether tokens of a turn\n # should be considered for training. Useful to selectively train on certain turns\n # besides the `roles_to_train`.\n message_field_training: str | None\n # The key in the message turn that contains the training details. Useful to\n # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n # containing `begin_offset` (start character index in content), `end_offset` (end\n # character index in content), and `train` (boolean whether to train).\n message_field_training_detail: str | None\n # (for Qwen3 template only) Whether to split the assistant content based on a\n # reasoning trace inside delimited tags\n split_thinking: bool | None\n logprobs_field: str | None\n temperature: float | None\n # Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: list[str] | None\n # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n # turn, last: train on the last EOS token in the conversation\n train_on_eos: Literal['all', 'turn', 'last'] | None\n # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n # source roles will be mapped to the target role. The default is: user: [\"human\",\n # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n roles: dict[str, list[str]] | None\n # Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. If\n # you wish to, we recommend using a custom jinja template with the default system\n # message removed or adding a system turn with empty content.\n drop_system_message: bool | None\n # Trust remote code for untrusted source\n trust_remote_code: bool | None = False\n # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n # This can be a commit hash, tag, or branch name. If not specified, the latest version\n # will be used. 
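\n# Example (illustrative sketch): holding out an explicit evaluation split via\n# `test_datasets` instead of `val_set_size` (the two are mutually exclusive); the path is a\n# placeholder.\n# test_datasets:\n#   - path: your-org/your-chat-dataset\n#     type: chat_template\n#     split: test\n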
This parameter is ignored for local datasets.\n revision: str | None\n\n # For DPODataset:\n path: str | None\n split: str | None\n type: UserDefinedDPOType | str | None\n # For UserDefinedDPOType:\n field_system: str | None\n field_prompt: str | None\n field_chosen: str | None\n field_rejected: str | None\n prompt_format: str | None\n chosen_format: str | None\n rejected_format: str | None\n data_files: list[str] | None\n revision: str | None\n field_messages: str | None\n\n # For KTODataset:\n path: str | None\n split: str | None\n type: UserDefinedKTOType | str | None\n # For UserDefinedKTOType:\n field_system: str | None\n field_prompt: str | None\n field_completion: str | None\n field_label: bool | None\n prompt_format: str | None\n completion_format: str | None\n data_files: list[str] | None\n trust_remote_code: bool | None = False\n revision: str | None\n\n # For StepwiseSupervisedDataset:\n path: str | None\n split: str | None\n data_files: list[str] | None\n revision: str | None\n step_separator: str | None\n max_completion_length: int | None\n train_on_last_step_only: bool | None\n\n# If false, the datasets will not be shuffled and will keep their original order in\n# `datasets`. The same applies to the `test_datasets` option and the\n# `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: bool | None = True\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: str | None\n# Num shards for whole dataset\ndataset_shard_num: int | None\n# Index of shard to use for whole dataset\ndataset_shard_idx: int | None\nskip_prepare_dataset: bool | None = False\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None\n # For PretrainingDataset:\n name: str | None\n path: str | None\n split: str | None = train\n text_column: str | None = text\n type: str | None = pretrain\n trust_remote_code: bool | None = False\n data_files: str | None\n skip: int | None\n\n # For SFTDataset:\n # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n path: str | None\n # name of dataset split to load from\n split: str | None\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: str | UserDefinedPrompterType | None\n # For UserDefinedPrompterType:\n # Custom user instruction prompt\n system_prompt: str | None\n # Use {system} as key to be replaced\n system_format: str | None\n field_system: str | None\n field_instruction: str | None\n field_input: str | None\n field_output: str | None\n\n # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n # be replaced. 
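\n# Example (illustrative sketch): streaming a large corpus with `pretraining_dataset`\n# instead of pre-tokenizing; the path is a placeholder and the remaining values are the\n# documented defaults. Since a streamed dataset has no fixed length, `max_steps` is\n# typically also set.\n# pretraining_dataset:\n#   - path: your-org/your-text-corpus\n#     type: pretrain\n#     split: train\n#     text_column: text\n# max_steps: 10000\n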
'format' can include {input}\n format: str | None\n # 'no_input_format' cannot include {input}\n no_input_format: str | None\n # For `completion` datsets only, uses the provided field instead of `text` column\n field: str | None\n input_transform: str | None\n # split dataset into N pieces (use with shards_idx)\n shards: int | None\n # the index of sharded dataset to use\n shards_idx: int | None\n # process dataset in N sequential chunks for memory efficiency (exclusive with\n # `shards`)\n preprocess_shards: int | None\n conversation: str | None\n\n # The name of the chat template to use for training, following values are supported:\n # tokenizer_default: Uses the chat template that is available in the\n # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n # will raise an error. This is the default.\n # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n # template. The custom jinja template should be provided in the chat_template_jinja\n # field.\n chat_template: ChatTemplate | str | None\n # Custom jinja chat template. Used only if `chat_template: jinja` or empty.\n chat_template_jinja: str | None\n # path to source data files\n data_files: str | list[str] | None\n input_format: str | None\n # name of dataset configuration to load\n name: str | None\n # defines the datatype when path is a file\n ds_type: str | None\n field: str | None\n field_human: str | None\n field_model: str | None\n # Key containing the messages (default: \"messages\")\n field_messages: str | None\n # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n # schema](https://json-schema.org/learn/getting-started-step-by-step).\n field_tools: str | None\n\n message_field_role: str | None\n\n message_field_content: str | None\n # Mapping of properties from the input dataset to the chat template. (default:\n # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n # in the template but not in this mapping, the system will attempt to load it directly\n # from the message using the property name as the key. Example: In the mapping below,\n # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n # used as 'content' in the chat template.\n message_property_mappings: dict[str, str] | None\n # The key in the message turn that indicates via boolean whether tokens of a turn\n # should be considered for training. Useful to selectively train on certain turns\n # besides the `roles_to_train`.\n message_field_training: str | None\n # The key in the message turn that contains the training details. Useful to\n # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n # containing `begin_offset` (start character index in content), `end_offset` (end\n # character index in content), and `train` (boolean whether to train).\n message_field_training_detail: str | None\n # (for Qwen3 template only) Whether to split the assistant content based on a\n # reasoning trace inside delimited tags\n split_thinking: bool | None\n logprobs_field: str | None\n temperature: float | None\n # Roles to train on. 
The tokens from these roles will be considered for the loss.\n roles_to_train: list[str] | None\n # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n # turn, last: train on the last EOS token in the conversation\n train_on_eos: Literal['all', 'turn', 'last'] | None\n # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n # source roles will be mapped to the target role. The default is: user: [\"human\",\n # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n roles: dict[str, list[str]] | None\n # Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. If\n # you wish to, we recommend using a custom jinja template with the default system\n # message removed or adding a system turn with empty content.\n drop_system_message: bool | None\n # Trust remote code for untrusted source\n trust_remote_code: bool | None = False\n # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n # This can be a commit hash, tag, or branch name. If not specified, the latest version\n # will be used. This parameter is ignored for local datasets.\n revision: str | None\n\n# The maximum number of processes to use while preprocessing your input dataset. This\n# defaults to `os.cpu_count()` if not set.\ndataset_processes: int | None = 4\n# Deduplicates datasets and test_datasets with identical entries\ndataset_exact_deduplication: bool | None\n# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking\n# too much storage\ndataset_keep_in_memory: bool | None\ndataloader_pin_memory: bool | None\ndataloader_num_workers: int | None\ndataloader_prefetch_factor: int | None\ndataloader_drop_last: bool | None\n\naccelerator_config: dict[str, Any] | None\n\nremove_unused_columns: bool | None\n\n# Push prepared dataset to hub - repo_org/repo_name\npush_dataset_to_hub: str | None\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private\n# datasets. Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: bool | None\n\ndevice: Any | None\n# Passed through to transformers when loading the model when launched without\n# accelerate. Use `sequential` when training w/ model parallelism to limit memory\ndevice_map: Any | None\nworld_size: int | None\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank: int | None\nddp: bool | None\n\n# Seed for reproducibility\nseed: int | None\n# Advanced DDP Arguments - timeout\nddp_timeout: int | None\n# Advanced DDP Arguments - bucket cap in MB\nddp_bucket_cap_mb: int | None\n# Advanced DDP Arguments - broadcast buffers\nddp_broadcast_buffers: bool | None\nddp_find_unused_parameters: bool | None\n\n# Approximate number of predictions sent to wandb depending on batch size. Enabled above\n# 0. Default is 0\neval_table_size: int | None\n# Total number of tokens generated for predictions sent to wandb. Default is 128\neval_max_new_tokens: int | None\n# Whether to run causal language model evaluation for metrics in\n# `eval_causal_lm_metrics`\ndo_causal_lm_eval: bool | None\n# HF evaluate metrics used during evaluation. 
Default is ['sacrebleu', 'comet', 'ter',\n# 'chrf', 'perplexity']\neval_causal_lm_metrics: list[str] | None\ndo_bench_eval: bool | None\nbench_dataset: str | None\nbench_split: str | None\nmetric_for_best_model: str | None\ngreater_is_better: bool | None\n\n# High loss value, indicating the learning has broken down (a good estimate is ~2 times\n# the loss at the start of training)\nloss_watchdog_threshold: float | None\n# Number of high-loss steps in a row before the trainer aborts (default: 3)\nloss_watchdog_patience: int | None\n\ngc_steps: int | None\n\n# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.\n# require >=ampere\nbf16: Literal['auto'] | bool | None = auto\n# Use CUDA fp16\nfp16: bool | None\nfp8: bool | None\n# No AMP (automatic mixed precision) - require >=ampere\nbfloat16: bool | None\n# No AMP (automatic mixed precision)\nfloat16: bool | None\n# Use CUDA tf32 - require >=ampere\ntf32: bool | None\nfloat32: bool | None\n\n# Whether to use gradient checkpointing. Available options are: true, false, 'offload',\n# 'offload_disk'.\n# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False\n# Additional kwargs to pass to the trainer for gradient checkpointing\ngradient_checkpointing_kwargs: dict[str, Any] | None\n\nunfrozen_parameters: list[str] | None\n\n# The maximum length of an input to train with, this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: int = 512\n# The maximum length of an input for evaluation. If not specified, defaults to\n# sequence_len\neval_sequence_len: int | None\nmin_sample_len: int | None\n# maximum prompt length for RL training\nmax_prompt_len: int = 512\n# Use efficient multi-packing with block diagonal attention and per sequence\n# position_ids. Recommend set to 'true'\nsample_packing: bool | None\n# The number of samples packed at a time. Increasing the following values helps with\n# packing, but usually only slightly (<%1.)\nsample_packing_group_size: int | None = 100000\n# The number of samples which can be packed into one sequence. Increase if using a large\n# sequence_len with many short samples.\nsample_packing_bin_size: int | None = 200\n# Whether to pack samples sequentially\nsample_packing_sequentially: bool | None\n# The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or\n# 'forkserver'\nsample_packing_mp_start_method: str | None\n# Set to 'false' if getting errors during eval with sample_packing on\neval_sample_packing: bool | None\n# Pad inputs so each step uses constant sized buffers. 
This will reduce memory\n# fragmentation and may prevent OOMs, by re-using memory more efficiently\npad_to_sequence_len: bool | None\n# Whether to use sequential sampling for curriculum learning\ncurriculum_sampling: bool | None\nmultipack_real_batches: bool | None\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation: bool | None\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening: Literal['auto'] | bool | None\n\nuse_pose: bool | None\npose_split_on_token_ids: list[int] | None\npose_max_context_len: int | None\npose_num_chunks: int | None\n\npretrain_multipack_buffer_size: int | None = 10000\n# whether to prevent cross attention for packed sequences during pretraining\npretrain_multipack_attn: bool | None = True\n\n# Whether to use xformers attention patch https://github.com/facebookresearch/xformers\nxformers_attention: bool | None\n# Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/\n# torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention: bool | None\n# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention: bool | None\nflex_attention: bool | None\nflex_attn_compile_kwargs: dict[str, Any] | None\n# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention\nflash_attention: bool | None\n# Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_cross_entropy: bool | None\n# Whether to use flash-attention rms norm implementation - advanced use only\nflash_attn_rms_norm: bool | None\n# Whether to fuse QKV into a single operation\nflash_attn_fuse_qkv: bool | None\n# Whether to fuse part of the MLP into a single operation\nflash_attn_fuse_mlp: bool | None\n# Whether to use bettertransformers\nflash_optimum: bool | None\n\neager_attention: bool | None\n\nunsloth_cross_entropy_loss: bool | None\nunsloth_lora_mlp: bool | None\nunsloth_lora_qkv: bool | None\nunsloth_lora_o: bool | None\nunsloth_rms_norm: bool | None\nunsloth_rope: bool | None\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_mlp_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_qkv_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_o_kernel: bool | None\n\n# Whether to use chunked cross entropy loss for memory efficiency\nchunked_cross_entropy: bool | None\n# Number of chunks to use for chunked cross entropy loss\nchunked_cross_entropy_num_chunks: int | None\n\nllama4_linearized_experts: bool | None\n\n# Deepspeed config path. e.g., deepspeed_configs/zero3.json\ndeepspeed: str | dict[str, Any] | None\n# FSDP configuration\nfsdp: list[str] | None\n# FSDP configuration options\nfsdp_config: dict[str, Any] | None\nfsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for\n# no eval.\nval_set_size: float | None = 0.0\n\n# Set to a divisor of the number of GPUs available to split sequences into chunks of\n# equal size. 
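\n# Example (illustrative sketch): a common packing, attention and precision setup drawn\n# from the options above; values are placeholders, not recommendations.\n# sequence_len: 4096\n# sample_packing: true\n# eval_sample_packing: false\n# pad_to_sequence_len: true\n# flash_attention: true\n# bf16: auto\n# gradient_checkpointing: true\n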
Use in long context training to prevent OOM when sequences cannot fit into\n# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each\n# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized\n# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more\n# details.\nsequence_parallel_degree: int | None\n# Optional; strides across the key dimension. Larger values use more memory but should\n# make training faster. Must evenly divide the number of KV heads in your model.\nheads_k_stride: int | None\n# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to\n# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing\n# case.\nring_attn_func: RingAttnFunc | None\n\n# Add or change special tokens. If you add tokens here, you don't need to add them to\n# the `tokens` list.\nspecial_tokens: SpecialTokensConfig | None\n # For SpecialTokensConfig:\n bos_token: str | None\n eos_token: str | None\n pad_token: str | None\n unk_token: str | None\n additional_special_tokens: list[str] | None\n\n# Add extra tokens to the tokenizer\ntokens: list[str] | None\n# Mapping token_id to new_token_string to override reserved added_tokens in the\n# tokenizer. Only works for tokens that are not part of the base vocab (aka are\n# added_tokens). Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides: dict[int, str] | None\n\n# Whether to use torch.compile and which backend to use. setting to `auto` will enable\n# torch compile when torch>=2.5.1\ntorch_compile: Literal['auto'] | bool | None\n# Backend to use for torch.compile\ntorch_compile_backend: str | None\ntorch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None\n\n# Maximum number of iterations to train for. It precedes num_epochs which means that if\n# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =>\n# `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps: int | None\n# Number of warmup steps. Cannot use with warmup_ratio\nwarmup_steps: int | None\n# Warmup ratio. Cannot use with warmup_steps\nwarmup_ratio: float | None\n# Leave empty to eval at each epoch, integer for every N steps. float for fraction of\n# total steps\neval_steps: int | float | None\n# Number of times per epoch to run evals, mutually exclusive with eval_steps\nevals_per_epoch: int | None\n# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer\n# from `eval_steps`\neval_strategy: str | None\n# Leave empty to save at each epoch, integer for every N steps. float for fraction of\n# total steps\nsave_steps: int | float | None\n# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsaves_per_epoch: int | None\n# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better\n# result is achieved, leave empty to infer from `save_steps`\nsave_strategy: str | None\n# Checkpoints saved at a time\nsave_total_limit: int | None\n# Logging frequency\nlogging_steps: int | None\n# Stop training after this many evaluation losses have increased in a row. https://huggi\n# ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin\n# gCallback\nearly_stopping_patience: int | None\nload_best_model_at_end: bool | None = False\n# Save only the model weights, skipping the optimizer. 
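\n# Example (illustrative sketch): scheduling evals, checkpoints and logging with the\n# per-epoch options above (mutually exclusive with eval_steps/save_steps); values are\n# placeholders.\n# num_epochs: 2\n# warmup_ratio: 0.05\n# evals_per_epoch: 4\n# saves_per_epoch: 1\n# save_total_limit: 2\n# logging_steps: 10\n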
Using this means you can't resume\n# from checkpoints.\nsave_only_model: bool | None = False\n# Use tensorboard for logging\nuse_tensorboard: bool | None\n# Enable the pytorch profiler to capture the first N steps of training to the\n# output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more\n# information. Snapshots can be visualized @ https://pytorch.org/memory_viz\nprofiler_steps: int | None\n# bool of whether to include tokens trainer per second in the training metrics. This\n# iterates over the entire dataset once, so it takes some time.\ninclude_tokens_per_second: bool | None\n\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to\n# add noise to embeddings. Currently only supported on Llama and Mistral\nneftune_noise_alpha: float | None\n\n# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to\n# `beta` in `ORPOConfig` due to trl mapping.\norpo_alpha: float | None\n# Weighting of NLL term in loss from RPO paper\nrpo_alpha: float | None\n# Target reward margin for the SimPO loss\nsimpo_gamma: float | None\n# Weight of the BC regularizer\ncpo_alpha: float | None\n\n# Factor for desirable loss term in KTO loss\nkto_desirable_weight: float | None\n# Factor for undesirable loss term in KTO loss\nkto_undesirable_weight: float | None\n# The beta parameter for the RL training\nrl_beta: float | None\n\n# Defines the max memory usage per gpu on the system. Passed through to transformers\n# when loading the model.\nmax_memory: dict[int | Literal['cpu', 'disk'], int | str] | None\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in\n# gigabytes); default: unset\ngpu_memory_limit: int | str | None\n# Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage: bool | None\n\n# The name of the chat template to use for training, following values are supported:\n# tokenizer_default: Uses the chat template that is available in the\n# tokenizer_config.json. If the chat template is not available in the tokenizer, it will\n# raise an error. This is the default value.\n# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.\n# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not\n# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.\n# The custom jinja template should be provided in the chat_template_jinja field. The\n# selected chat template will be saved to the tokenizer_config.json for easier\n# inferencing\nchat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None\n# Custom jinja template for chat template. This will be only used if chat_template is\n# set to `jinja` or `null` (in which case chat_template is automatically set to\n# `jinja`). Default is null.\nchat_template_jinja: str | None\n# Additional kwargs to pass to the chat template. This is useful for customizing the\n# chat template. For example, you can pass `thinking=False` to add a generation prompt\n# to the chat template.\nchat_template_kwargs: dict[str, Any] | None\n# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the\n# boundaries between conversation turns. For example: ['/INST', '</s>',\n# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. 
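\n# Example (illustrative sketch): selecting a chat template with a fallback and masking on\n# explicit end-of-turn tokens; the token list is illustrative and should match your\n# template.\n# chat_template: tokenizer_default_fallback_chatml\n# eot_tokens:\n#   - \"</s>\"\n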
This is\n# useful for templates that use multiple delimiter tokens.\neot_tokens: list[str] | None\n# Changes the default system message. Currently only supports chatml.\ndefault_system_message: str | None\n\nfix_untrained_tokens: int | list[int] | None\n\nis_preprocess: bool | None\npreprocess_iterable: bool | None\n\n# Total number of tokens - internal use\ntotal_num_tokens: int | None\ntotal_supervised_tokens: int | None\n# You can set these packing optimizations AFTER starting a training at least once. The\n# trainer will provide recommended values for these values.\nsample_packing_eff_est: float | None\naxolotl_config_path: str | None\n\n# Internal use only - Used to identify which the model is based on\nis_falcon_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on\nis_llama_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on. Please note that if\n# you set this to true, `padding_side` will be set to 'left' by default\nis_mistral_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on\nis_qwen_derived_model: bool | None\n\n# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available\n# plugins or doc below for more details.\n# https://docs.axolotl.ai/docs/custom_integrations.html\nplugins: list[str] | None\n\n# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This\n# can also be a relative path to a model on disk\nbase_model: str (required)\n# If the base_model repo on hf hub doesn't include configuration .json files, You can\n# set that here, or leave this empty to default to base_model\nbase_model_config: str | None\ncls_model_config: str | None\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one defined in the base model\ntokenizer_config: str | None\n# use_fast option for tokenizer loading from_pretrained, default to True\ntokenizer_use_fast: bool | None\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy: bool | None\n# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-\n# common tokenizer.\ntokenizer_use_mistral_common: bool | None\n# Corresponding tokenizer for the model AutoTokenizer is a good choice\ntokenizer_type: str | None\n# transformers processor class\nprocessor_type: str | None\n# Trust remote code for untrusted source\ntrust_remote_code: bool | None\n\n# Where to save the full-finetuned model to\noutput_dir: str = ./model-out\n# push checkpoints to hub\nhub_model_id: str | None\n# how to push checkpoints to hub\nhub_strategy: str | None\n# Save model as safetensors (require safetensors package). Default True\nsave_safetensors: bool | None = True\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: bool | None = False\n# Use bitsandbytes 4 bit\nload_in_4bit: bool | None = False\n\n# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in\n# original model\nadapter: str | None\n# If you already have a lora model trained that you want to load, put that here. This\n# means after training, if you want to test the model, you should set this to the value\n# of `output_dir`. 
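\n# Example (illustrative sketch): loading a base model in 4-bit for QLoRA training; the\n# model id is a placeholder.\n# base_model: your-org/your-base-model\n# load_in_4bit: true\n# adapter: qlora\n# output_dir: ./model-out\n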
Note that if you merge an adapter to the base model, a new\n# subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir: str | None\nlora_r: int | None\nlora_alpha: int | None\nlora_fan_in_fan_out: bool | None\nlora_target_modules: str | list[str] | None\n# If true, will target all linear modules\nlora_target_linear: bool | None\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules\n# because they need to know the new tokens. For LLaMA and Mistral, you need to save\n# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts\n# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\nlora_modules_to_save: list[str] | None\nlora_dropout: float | None = 0.0\n# The layer indices to transform, otherwise, apply to all layers\npeft_layers_to_transform: list[int] | None\npeft_layers_pattern: list[str] | None\n\npeft: PeftConfig | None\n # For PeftConfig:\n # Configuration options for loftq initialization for LoRA\n loftq_config: LoftQConfig | None\n # For LoftQConfig:\n # typically 4 bits\n loftq_bits: int = 4\n\n# Whether to use DoRA.\npeft_use_dora: bool | None\n# Whether to use RSLoRA.\npeft_use_rslora: bool | None\n# List of layer indices to replicate.\npeft_layer_replication: list[tuple[int, int]] | None\n# How to initialize LoRA weights. Default to True which is MS original implementation.\npeft_init_lora_weights: bool | str | None\n\n# load qlora model in sharded format for FSDP using answer.ai technique.\nqlora_sharded_model_loading: bool | None = False\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it\n# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: bool | None\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: bool | None\n# optional overrides to the bnb 4bit quantization configuration\nbnb_config_kwargs: dict[str, Any] | None\n\n# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_ratio: float | None\n# loraplus learning rate for lora embedding layers. Default value is 1e-6.\nloraplus_lr_embedding: float | None = 1e-06\n\nmerge_lora: bool | None\n\n# Number of steps per ReLoRA restart\nrelora_steps: int | None\n# Number of per-restart warmup steps\nrelora_warmup_steps: int | None\n# Number of anneal steps for each relora cycle\nrelora_anneal_steps: int | None\n# threshold for optimizer magnitude when pruning\nrelora_prune_ratio: float | None\n# True to perform lora weight merges on cpu during restarts, for modest gpu memory\n# savings\nrelora_cpu_offload: bool | None\n\n# If greater than 1, backpropagation will be skipped and the gradients will be\n# accumulated for the given number of steps.\ngradient_accumulation_steps: int | None = 1\n# The number of samples to include in each batch. This is the number of samples sent to\n# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: int | None = 1\n# Total batch size, we do not recommended setting this manually\nbatch_size: int | None\n# per gpu micro batch size for evals, defaults to value of micro_batch_size\neval_batch_size: int | None\n\n# whether to find batch size that fits in memory. Passed to underlying transformers\n# Trainer\nauto_find_batch_size: bool | None\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: bool | None = False\n# Group similarly sized data to minimize padding. 
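\n# Example (illustrative sketch): typical LoRA hyperparameters together with the batch\n# sizing knobs above; values are placeholders, not tuned recommendations.\n# lora_r: 16\n# lora_alpha: 32\n# lora_dropout: 0.05\n# lora_target_linear: true\n# micro_batch_size: 2\n# gradient_accumulation_steps: 4\n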
May be slower to start, as it must\n# download and sort the entire dataset. Note that training loss may have an oscillating\n# pattern with this enabled.\ngroup_by_length: bool | None\n\nlearning_rate: str | float (required)\nembedding_lr: float | None\nembedding_lr_scale: float | None\n# Specify weight decay\nweight_decay: float | None = 0.0\n# Specify optimizer\noptimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED\n# Dictionary of arguments to pass to the optimizer\noptim_args: str | dict[str, Any] | None\n# The target modules to optimize, i.e. the module names that you would like to train,\n# right now this is used only for GaLore algorithm\noptim_target_modules: list[str] | Literal['all_linear'] | None\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path: str | None\nlr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE\n# Specify a scheduler and kwargs to use with the optimizer\nlr_scheduler_kwargs: dict[str, Any] | None\nlr_quadratic_warmup: bool | None\n# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of\n# peak lr\ncosine_min_lr_ratio: float | None\n# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means\n# start cosine_min_lr at 80% of training step\ncosine_constant_lr_ratio: float | None\n# Learning rate div factor\nlr_div_factor: float | None\n\nlr_groups: list[LrGroup] | None\n # For LrGroup:\n name: str (required)\n modules: list[str] (required)\n lr: float (required)\n\n# adamw hyperparams\nadam_epsilon: float | None\n# only used for CAME Optimizer\nadam_epsilon2: float | None\n# adamw hyperparams\nadam_beta1: float | None\n# adamw hyperparams\nadam_beta2: float | None\n# only used for CAME Optimizer\nadam_beta3: float | None\n# Gradient clipping max norm\nmax_grad_norm: float | None\nnum_epochs: float = 1.0\n\nuse_wandb: bool | None\n# Set the name of your wandb run\nwandb_name: str | None\n# Set the ID of your wandb run\nwandb_run_id: str | None\n# \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn\n# off wandb\nwandb_mode: str | None\n# Your wandb project name\nwandb_project: str | None\n# A wandb Team name if using a Team\nwandb_entity: str | None\nwandb_watch: str | None\n# \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only\n# at the end of training\nwandb_log_model: str | None\n\nuse_mlflow: bool | None\n# URI to mlflow\nmlflow_tracking_uri: str | None\n# Your experiment name\nmlflow_experiment_name: str | None\n# Your run name\nmlflow_run_name: str | None\n# set to true to copy each saved checkpoint on each save to mlflow artifact registry\nhf_mlflow_log_artifacts: bool | None\n\n# Enable or disable Comet integration.\nuse_comet: bool | None\n# API key for Comet. Recommended to set via `comet login`.\ncomet_api_key: str | None\n# Workspace name in Comet. Defaults to the user's default workspace.\ncomet_workspace: str | None\n# Project name in Comet. Defaults to Uncategorized.\ncomet_project_name: str | None\n# Identifier for the experiment. Used to append data to an existing experiment or\n# control the key of new experiments. Default to a random key.\ncomet_experiment_key: str | None\n# Create a new experiment (\"create\") or log to an existing one (\"get\"). Default\n# (\"get_or_create\") auto-selects based on configuration.\ncomet_mode: str | None\n# Set to True to log data to Comet server, or False for offline storage. 
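\n# Example (illustrative sketch): optimizer, schedule and W&B logging settings drawn from\n# the options above; values and names are placeholders.\n# learning_rate: 2e-4\n# weight_decay: 0.0\n# optimizer: adamw_torch_fused\n# lr_scheduler: cosine\n# max_grad_norm: 1.0\n# use_wandb: true\n# wandb_project: my-finetune\n# wandb_name: run-1\n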
Default is\n# True.\ncomet_online: bool | None\n# Dictionary for additional configuration settings, see the doc for more details.\ncomet_experiment_config: dict[str, Any] | None\n\n# the number of activate layers in LISA\nlisa_n_layers: int | None\n# how often to switch layers in LISA\nlisa_step_interval: int | None\n# path under the model to access the layers\nlisa_layers_attribute: str | None = model.layers\n\ngradio_title: str | None\ngradio_share: bool | None\ngradio_server_name: str | None\ngradio_server_port: int | None\ngradio_max_new_tokens: int | None\ngradio_temperature: float | None\n\nuse_ray: bool = False\nray_run_name: str | None\nray_num_workers: int = 1\nresources_per_worker: dict\n\n# The size of the image to resize to. It can be an integer (resized into padded-square\n# image) or a tuple (width, height).If not provided, we will attempt to load from\n# preprocessor.size, otherwise, images won't be resized.\nimage_size: int | tuple[int, int] | None\n# The resampling algorithm to use for image resizing. Default is bilinear. Please refer\n# to PIL.Image.Resampling for more details.\nimage_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None\n\n# optional overrides to the base model configuration\noverrides_of_model_config: dict[str, Any] | None\n# optional overrides the base model loading from_pretrained\noverrides_of_model_kwargs: dict[str, Any] | None\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good\n# choice too\ntype_of_model: str | None\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model: str | None\n\nmax_packed_sequence_len: int | None\nrope_scaling: Any | None\nnoisy_embedding_alpha: float | None\ndpo_beta: float | None\nevaluation_strategy: str | None", + "text": "# Allow overwrite yml config using from cli\nstrict: bool | None = False\n# Resume from a specific checkpoint dir\nresume_from_checkpoint: str | None\n# If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: bool | None\n# Resize the model embeddings when new tokens are added to multiples of 32. This is\n# reported to improve training speed on some models\nresize_token_embeddings_to_32x: bool | None\nmean_resizing_embeddings: bool | None = False\n\n# Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.\nshrink_embeddings: bool | None\n# Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs\nembeddings_skip_upcast: bool | None\n\n# Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'\nrl: RLType | None\n\ntrl: TRLConfig | None\n # For TRLConfig:\n # Beta parameter for the RL training. Same as `rl_beta`. Use\n beta: float | None\n # Maximum length of the completion for RL training.\n max_completion_length: int | None\n\n # Whether to use VLLM for RL training.\n use_vllm: bool = False\n # Host of the vLLM server to connect to.\n vllm_server_host: str | None = 0.0.0.0\n # Port of the vLLM server to connect to.\n vllm_server_port: int | None = 8000\n # Total timeout (in seconds) to wait for the vLLM server to respond.\n vllm_server_timeout: int | None\n # Regex for vLLM guided decoding.\n vllm_guided_decoding_regex: str | None\n\n # List of reward functions to load. 
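\n# Example (illustrative sketch): GRPO training served by an external vLLM instance, using\n# the TRLConfig fields above; the reward function module path and beta value are\n# placeholders.\n# rl: grpo\n# trl:\n#   beta: 0.04\n#   max_completion_length: 256\n#   use_vllm: true\n#   vllm_server_host: 0.0.0.0\n#   vllm_server_port: 8000\n#   reward_funcs: [\"rewards.my_reward_func\"]\n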
Paths must be importable from current dir.\n reward_funcs: list[str] | None\n # List of reward weights for the reward functions.\n reward_weights: list[float] | None\n # Number of generations to sample.\n num_generations: int | None\n # Whether to log completions.\n log_completions: bool | None = False\n # Number of completions to print when log_completions is True.\n num_completions_to_print: int | None\n # Whether to sync the reference model.\n sync_ref_model: bool | None = False\n # Mixup alpha for the reference model.\n ref_model_mixup_alpha: float | None = 0.9\n # Sync steps for the reference model.\n ref_model_sync_steps: int | None = 64\n # Whether to scale rewards by their standard deviation.\n scale_rewards: bool = True\n\n # Sampling temperature for the GRPO policy.\n temperature: float | None\n # Top-p sampling probability for the generation policy.\n top_p: float | None\n # Top-k sampling for the generation policy.\n top_k: int | None\n # Minimum probability for the generation policy.\n min_p: float | None\n # Penalty for tokens that appear in prompt and generated text.\n repetition_penalty: float | None\n # Number of iterations per batch (μ) for GRPO.\n num_iterations: int | None\n # Epsilon value for clipping in the GRPO algorithm.\n epsilon: float | None\n # Upper-bound epsilon value for clipping in the GRPO algorithm.\n epsilon_high: float | None\n # Whether to use Liger loss for GRPO.\n use_liger_loss: bool | None\n # Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.\n loss_type: str | None\n # Whether to exclude truncated completions from loss calculation.\n mask_truncated_completions: bool = False\n\nvllm: VllmConfig | None\n # For VllmConfig:\n # Device to use for VLLM\n device: str | None = auto\n # Tensor parallel size for VLLM\n tensor_parallel_size: int | None\n # GPU memory utilization for VLLM\n gpu_memory_utilization: float | None = 0.9\n # Data type for VLLM\n dtype: str | None = auto\n # Maximum length of the model context for VLLM\n max_model_len: int | None\n # Enable prefix caching for VLLM\n enable_prefix_caching: bool | None\n # Host for the vLLM server to start on\n host: str | None = 0.0.0.0\n # Port of the vLLM server to start on\n port: int | None = 8000\n\n # Enable reasoning for VLLM\n enable_reasoning: bool | None\n # Reasoning parser for VLLM\n reasoning_parser: str | None\n\nqat: QATConfig | None\n # For QATConfig:\n # Fake quantization layout to use for activation quantization. Valid options are\n # \"int4\" and \"int8\"\n activation_dtype: TorchIntDType | None\n # Fake quantization layout to use for weight quantization. Valid options are \"int4\"\n # and \"int8\"\n weight_dtype: TorchIntDType = TorchIntDType.int8\n # Quantize embedding\n quantize_embedding: bool | None = False\n # The number of elements in each group for per-group fake quantization\n group_size: int | None = 32\n # The number of steps to apply fake quantization after\n fake_quant_after_n_steps: int | None\n\nquantization: PTQConfig | None\n # For PTQConfig:\n # Fake quantization layout to use for weight quantization. Valid options are uintX for\n # X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8\n weight_dtype: TorchIntDType = TorchIntDType.int8\n # Fake quantization layout to use for activation quantization. 
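\n# Example (illustrative sketch): quantization-aware training with int8 fake quantization,\n# using the QATConfig fields above; values are placeholders.\n# qat:\n#   weight_dtype: int8\n#   activation_dtype: int8\n#   group_size: 32\n#   quantize_embedding: false\n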
Valid options are\n # \"int4\" and \"int8\"\n activation_dtype: TorchIntDType | None\n # Whether to quantize the embedding layer.\n quantize_embedding: bool | None\n # The number of elements in each group for per-group fake quantization\n group_size: int | None = 32\n\n# Reward modelling: `True` or `False`\nreward_model: bool | None\n# Process reward modelling: `True` or `False`\nprocess_reward_model: bool | None\nnum_labels: int | None\n\n# Whether to perform weighting in DPO trainer\ndpo_use_weighting: bool | None\ndpo_use_logits_to_keep: bool | None\ndpo_label_smoothing: float | None\ndpo_norm_loss: bool | None\ndpo_padding_free: bool | None\ndpo_generate_during_eval: bool | None\n\n# A list of one or more datasets to finetune the model with\ndatasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None\n # For SFTDataset:\n # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n path: str | None\n # name of dataset split to load from\n split: str | None\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: str | UserDefinedPrompterType | None\n # For UserDefinedPrompterType:\n # Custom user instruction prompt\n system_prompt: str | None\n # Use {system} as key to be replaced\n system_format: str | None\n field_system: str | None\n field_instruction: str | None\n field_input: str | None\n field_output: str | None\n\n # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n # be replaced. 'format' can include {input}\n format: str | None\n # 'no_input_format' cannot include {input}\n no_input_format: str | None\n # For `completion` datsets only, uses the provided field instead of `text` column\n field: str | None\n input_transform: str | None\n # split dataset into N pieces (use with shards_idx)\n shards: int | None\n # the index of sharded dataset to use\n shards_idx: int | None\n # process dataset in N sequential chunks for memory efficiency (exclusive with\n # `shards`)\n preprocess_shards: int | None\n conversation: str | None\n\n # The name of the chat template to use for training, following values are supported:\n # tokenizer_default: Uses the chat template that is available in the\n # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n # will raise an error. This is the default.\n # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n # template. The custom jinja template should be provided in the chat_template_jinja\n # field.\n chat_template: ChatTemplate | str | None\n # Custom jinja chat template. Used only if `chat_template: jinja` or empty.\n chat_template_jinja: str | None\n # path to source data files\n data_files: str | list[str] | None\n input_format: str | None\n # name of dataset configuration to load\n name: str | None\n # defines the datatype when path is a file\n ds_type: str | None\n field: str | None\n field_human: str | None\n field_model: str | None\n # Key containing the messages (default: \"messages\")\n field_messages: str | None\n # Key containing the tools (default: \"tools\"). 
Must be a list[dict] and follow [JSON\n # schema](https://json-schema.org/learn/getting-started-step-by-step).\n field_tools: str | None\n\n message_field_role: str | None\n\n message_field_content: str | None\n # Mapping of properties from the input dataset to the chat template. (default:\n # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n # in the template but not in this mapping, the system will attempt to load it directly\n # from the message using the property name as the key. Example: In the mapping below,\n # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n # used as 'content' in the chat template.\n message_property_mappings: dict[str, str] | None\n # The key in the message turn that indicates via boolean whether tokens of a turn\n # should be considered for training. Useful to selectively train on certain turns\n # besides the `roles_to_train`.\n message_field_training: str | None\n # The key in the message turn that contains the training details. Useful to\n # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n # containing `begin_offset` (start character index in content), `end_offset` (end\n # character index in content), and `train` (boolean whether to train).\n message_field_training_detail: str | None\n # (for Qwen3 template only) Whether to split the assistant content based on a\n # reasoning trace inside delimited tags\n split_thinking: bool | None\n logprobs_field: str | None\n temperature: float | None\n # Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: list[str] | None\n # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n # turn, last: train on the last EOS token in the conversation\n train_on_eos: Literal['all', 'turn', 'last'] | None\n # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n # source roles will be mapped to the target role. The default is: user: [\"human\",\n # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n roles: dict[str, list[str]] | None\n # Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. If\n # you wish to, we recommend using a custom jinja template with the default system\n # message removed or adding a system turn with empty content.\n drop_system_message: bool | None\n # Trust remote code for untrusted source\n trust_remote_code: bool | None = False\n # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n # This can be a commit hash, tag, or branch name. If not specified, the latest version\n # will be used. 
This parameter is ignored for local datasets.\n revision: str | None\n\n # For DPODataset:\n path: str | None\n split: str | None\n type: UserDefinedDPOType | str | None\n # For UserDefinedDPOType:\n field_system: str | None\n field_prompt: str | None\n field_chosen: str | None\n field_rejected: str | None\n prompt_format: str | None\n chosen_format: str | None\n rejected_format: str | None\n data_files: list[str] | None\n revision: str | None\n field_messages: str | None\n\n # For KTODataset:\n path: str | None\n split: str | None\n type: UserDefinedKTOType | str | None\n # For UserDefinedKTOType:\n field_system: str | None\n field_prompt: str | None\n field_completion: str | None\n field_label: bool | None\n prompt_format: str | None\n completion_format: str | None\n data_files: list[str] | None\n trust_remote_code: bool | None = False\n revision: str | None\n\n # For StepwiseSupervisedDataset:\n path: str | None\n split: str | None\n data_files: list[str] | None\n revision: str | None\n step_separator: str | None\n max_completion_length: int | None\n train_on_last_step_only: bool | None\n\n# A list of one or more datasets to eval the model with. You can use either\n# test_datasets, or val_set_size, but not both.\ntest_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None\n # For SFTDataset:\n # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n path: str | None\n # name of dataset split to load from\n split: str | None\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: str | UserDefinedPrompterType | None\n # For UserDefinedPrompterType:\n # Custom user instruction prompt\n system_prompt: str | None\n # Use {system} as key to be replaced\n system_format: str | None\n field_system: str | None\n field_instruction: str | None\n field_input: str | None\n field_output: str | None\n\n # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n # be replaced. 'format' can include {input}\n format: str | None\n # 'no_input_format' cannot include {input}\n no_input_format: str | None\n # For `completion` datsets only, uses the provided field instead of `text` column\n field: str | None\n input_transform: str | None\n # split dataset into N pieces (use with shards_idx)\n shards: int | None\n # the index of sharded dataset to use\n shards_idx: int | None\n # process dataset in N sequential chunks for memory efficiency (exclusive with\n # `shards`)\n preprocess_shards: int | None\n conversation: str | None\n\n # The name of the chat template to use for training, following values are supported:\n # tokenizer_default: Uses the chat template that is available in the\n # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n # will raise an error. This is the default.\n # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n # template. The custom jinja template should be provided in the chat_template_jinja\n # field.\n chat_template: ChatTemplate | str | None\n # Custom jinja chat template. 
Used only if `chat_template: jinja` or empty.\n chat_template_jinja: str | None\n # path to source data files\n data_files: str | list[str] | None\n input_format: str | None\n # name of dataset configuration to load\n name: str | None\n # defines the datatype when path is a file\n ds_type: str | None\n field: str | None\n field_human: str | None\n field_model: str | None\n # Key containing the messages (default: \"messages\")\n field_messages: str | None\n # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n # schema](https://json-schema.org/learn/getting-started-step-by-step).\n field_tools: str | None\n\n message_field_role: str | None\n\n message_field_content: str | None\n # Mapping of properties from the input dataset to the chat template. (default:\n # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n # in the template but not in this mapping, the system will attempt to load it directly\n # from the message using the property name as the key. Example: In the mapping below,\n # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n # used as 'content' in the chat template.\n message_property_mappings: dict[str, str] | None\n # The key in the message turn that indicates via boolean whether tokens of a turn\n # should be considered for training. Useful to selectively train on certain turns\n # besides the `roles_to_train`.\n message_field_training: str | None\n # The key in the message turn that contains the training details. Useful to\n # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n # containing `begin_offset` (start character index in content), `end_offset` (end\n # character index in content), and `train` (boolean whether to train).\n message_field_training_detail: str | None\n # (for Qwen3 template only) Whether to split the assistant content based on a\n # reasoning trace inside delimited tags\n split_thinking: bool | None\n logprobs_field: str | None\n temperature: float | None\n # Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: list[str] | None\n # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n # turn, last: train on the last EOS token in the conversation\n train_on_eos: Literal['all', 'turn', 'last'] | None\n # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n # source roles will be mapped to the target role. The default is: user: [\"human\",\n # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n roles: dict[str, list[str]] | None\n # Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. If\n # you wish to, we recommend using a custom jinja template with the default system\n # message removed or adding a system turn with empty content.\n drop_system_message: bool | None\n # Trust remote code for untrusted source\n trust_remote_code: bool | None = False\n # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n # This can be a commit hash, tag, or branch name. If not specified, the latest version\n # will be used. 
This parameter is ignored for local datasets.\n revision: str | None\n\n # For DPODataset:\n path: str | None\n split: str | None\n type: UserDefinedDPOType | str | None\n # For UserDefinedDPOType:\n field_system: str | None\n field_prompt: str | None\n field_chosen: str | None\n field_rejected: str | None\n prompt_format: str | None\n chosen_format: str | None\n rejected_format: str | None\n data_files: list[str] | None\n revision: str | None\n field_messages: str | None\n\n # For KTODataset:\n path: str | None\n split: str | None\n type: UserDefinedKTOType | str | None\n # For UserDefinedKTOType:\n field_system: str | None\n field_prompt: str | None\n field_completion: str | None\n field_label: bool | None\n prompt_format: str | None\n completion_format: str | None\n data_files: list[str] | None\n trust_remote_code: bool | None = False\n revision: str | None\n\n # For StepwiseSupervisedDataset:\n path: str | None\n split: str | None\n data_files: list[str] | None\n revision: str | None\n step_separator: str | None\n max_completion_length: int | None\n train_on_last_step_only: bool | None\n\n# If false, the datasets will not be shuffled and will keep their original order in\n# `datasets`. The same applies to the `test_datasets` option and the\n# `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: bool | None = True\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: str | None\n# Num shards for whole dataset\ndataset_shard_num: int | None\n# Index of shard to use for whole dataset\ndataset_shard_idx: int | None\nskip_prepare_dataset: bool | None = False\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None\n # For PretrainingDataset:\n name: str | None\n path: str | None\n split: str | None = train\n text_column: str | None = text\n type: str | None = pretrain\n trust_remote_code: bool | None = False\n data_files: str | None\n skip: int | None\n\n # For SFTDataset:\n # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n path: str | None\n # name of dataset split to load from\n split: str | None\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: str | UserDefinedPrompterType | None\n # For UserDefinedPrompterType:\n # Custom user instruction prompt\n system_prompt: str | None\n # Use {system} as key to be replaced\n system_format: str | None\n field_system: str | None\n field_instruction: str | None\n field_input: str | None\n field_output: str | None\n\n # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n # be replaced. 
'format' can include {input}\n format: str | None\n # 'no_input_format' cannot include {input}\n no_input_format: str | None\n # For `completion` datasets only, uses the provided field instead of `text` column\n field: str | None\n input_transform: str | None\n # split dataset into N pieces (use with shards_idx)\n shards: int | None\n # the index of sharded dataset to use\n shards_idx: int | None\n # process dataset in N sequential chunks for memory efficiency (exclusive with\n # `shards`)\n preprocess_shards: int | None\n conversation: str | None\n\n # The name of the chat template to use for training, following values are supported:\n # tokenizer_default: Uses the chat template that is available in the\n # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n # will raise an error. This is the default.\n # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n # tokenizer_default_fallback_*: where * is the name of the chat template to fall back\n # to if the tokenizer does not have a chat template, else default to tokenizer. E.g.\n # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n # template. The custom jinja template should be provided in the chat_template_jinja\n # field.\n chat_template: ChatTemplate | str | None\n # Custom jinja chat template. Used only if `chat_template: jinja` or empty.\n chat_template_jinja: str | None\n # path to source data files\n data_files: str | list[str] | None\n input_format: str | None\n # name of dataset configuration to load\n name: str | None\n # defines the datatype when path is a file\n ds_type: str | None\n field: str | None\n field_human: str | None\n field_model: str | None\n # Key containing the messages (default: \"messages\")\n field_messages: str | None\n # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n # schema](https://json-schema.org/learn/getting-started-step-by-step).\n field_tools: str | None\n\n message_field_role: str | None\n\n message_field_content: str | None\n # Mapping of properties from the input dataset to the chat template. (default:\n # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n # in the template but not in this mapping, the system will attempt to load it directly\n # from the message using the property name as the key. Example: In the mapping below,\n # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n # used as 'content' in the chat template.\n message_property_mappings: dict[str, str] | None\n # The key in the message turn that indicates via boolean whether tokens of a turn\n # should be considered for training. Useful to selectively train on certain turns\n # besides the `roles_to_train`.\n message_field_training: str | None\n # The key in the message turn that contains the training details. Useful to\n # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n # containing `begin_offset` (start character index in content), `end_offset` (end\n # character index in content), and `train` (boolean whether to train).\n message_field_training_detail: str | None\n # (for Qwen3 template only) Whether to split the assistant content based on a\n # reasoning trace inside delimited tags\n split_thinking: bool | None\n logprobs_field: str | None\n temperature: float | None
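\n\n # Example (illustrative, not part of the schema): a chat_template dataset entry where\n # the messages live under a custom key and use 'from'/'value' properties; the dataset\n # path is a placeholder.\n # datasets:\n #   - path: your-org/your-chat-dataset\n #     type: chat_template\n #     field_messages: conversations\n #     message_property_mappings:\n #       role: from\n #       content: value\n\n # Roles to train on. 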
The tokens from these roles will be considered for the loss.\n roles_to_train: list[str] | None\n # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n # turn, last: train on the last EOS token in the conversation\n train_on_eos: Literal['all', 'turn', 'last'] | None\n # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n # source roles will be mapped to the target role. The default is: user: [\"human\",\n # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n roles: dict[str, list[str]] | None\n # Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. If\n # you wish to, we recommend using a custom jinja template with the default system\n # message removed or adding a system turn with empty content.\n drop_system_message: bool | None\n # Trust remote code for untrusted source\n trust_remote_code: bool | None = False\n # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n # This can be a commit hash, tag, or branch name. If not specified, the latest version\n # will be used. This parameter is ignored for local datasets.\n revision: str | None\n\n# The maximum number of processes to use while preprocessing your input dataset. This\n# defaults to `os.cpu_count()` if not set.\ndataset_processes: int | None = 4\n# Deduplicates datasets and test_datasets with identical entries\ndataset_exact_deduplication: bool | None\n# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking\n# too much storage\ndataset_keep_in_memory: bool | None\ndataloader_pin_memory: bool | None\ndataloader_num_workers: int | None\ndataloader_prefetch_factor: int | None\ndataloader_drop_last: bool | None\n\naccelerator_config: dict[str, Any] | None\n\nremove_unused_columns: bool | None\n\n# Push prepared dataset to hub - repo_org/repo_name\npush_dataset_to_hub: str | None\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private\n# datasets. Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: bool | None\n\ndevice: Any | None\n# Passed through to transformers when loading the model when launched without\n# accelerate. Use `sequential` when training w/ model parallelism to limit memory\ndevice_map: Any | None\nworld_size: int | None\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank: int | None\nddp: bool | None\n\n# Seed for reproducibility\nseed: int | None\n# Advanced DDP Arguments - timeout\nddp_timeout: int | None\n# Advanced DDP Arguments - bucket cap in MB\nddp_bucket_cap_mb: int | None\n# Advanced DDP Arguments - broadcast buffers\nddp_broadcast_buffers: bool | None\nddp_find_unused_parameters: bool | None\n\n# Approximate number of predictions sent to wandb depending on batch size. Enabled above\n# 0. Default is 0\neval_table_size: int | None\n# Total number of tokens generated for predictions sent to wandb. Default is 128\neval_max_new_tokens: int | None\n# Whether to run causal language model evaluation for metrics in\n# `eval_causal_lm_metrics`\ndo_causal_lm_eval: bool | None\n# HF evaluate metrics used during evaluation. 
Default is ['sacrebleu', 'comet', 'ter',\n# 'chrf', 'perplexity']\neval_causal_lm_metrics: list[str] | None\ndo_bench_eval: bool | None\nbench_dataset: str | None\nbench_split: str | None\nmetric_for_best_model: str | None\ngreater_is_better: bool | None\n\n# High loss value, indicating the learning has broken down (a good estimate is ~2 times\n# the loss at the start of training)\nloss_watchdog_threshold: float | None\n# Number of high-loss steps in a row before the trainer aborts (default: 3)\nloss_watchdog_patience: int | None\n\ngc_steps: int | None\n\n# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.\n# require >=ampere\nbf16: Literal['auto'] | bool | None = auto\n# Use CUDA fp16\nfp16: bool | None\nfp8: bool | None\n# No AMP (automatic mixed precision) - require >=ampere\nbfloat16: bool | None\n# No AMP (automatic mixed precision)\nfloat16: bool | None\n# Use CUDA tf32 - require >=ampere\ntf32: bool | None\nfloat32: bool | None\n\n# Whether to use gradient checkpointing. Available options are: true, false, 'offload',\n# 'offload_disk'.\n# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False\n# Additional kwargs to pass to the trainer for gradient checkpointing\ngradient_checkpointing_kwargs: dict[str, Any] | None\n\nunfrozen_parameters: list[str] | None\n\n# The maximum length of an input to train with; this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: int = 512\n# The maximum length of an input for evaluation. If not specified, defaults to\n# sequence_len\neval_sequence_len: int | None\nmin_sample_len: int | None\n# maximum prompt length for RL training\nmax_prompt_len: int = 512\n# Use efficient multi-packing with block diagonal attention and per sequence\n# position_ids. Recommended to set to 'true'\nsample_packing: bool | None\n# The number of samples packed at a time. Increasing the following values helps with\n# packing, but usually only slightly (<1%)\nsample_packing_group_size: int | None = 100000\n# The number of samples which can be packed into one sequence. Increase if using a large\n# sequence_len with many short samples.\nsample_packing_bin_size: int | None = 200\n# Whether to pack samples sequentially\nsample_packing_sequentially: bool | None\n# The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or\n# 'forkserver'\nsample_packing_mp_start_method: str | None\n# Set to 'false' if getting errors during eval with sample_packing on\neval_sample_packing: bool | None
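\n\n# Example (illustrative, not part of the schema): a typical packing setup for a 4k\n# context window:\n# sequence_len: 4096\n# sample_packing: true\n# eval_sample_packing: false\n# pad_to_sequence_len: true\n\n# Pad inputs so each step uses constant sized buffers. 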
This will reduce memory\n# fragmentation and may prevent OOMs, by re-using memory more efficiently\npad_to_sequence_len: bool | None\n# Whether to use sequential sampling for curriculum learning\ncurriculum_sampling: bool | None\nmultipack_real_batches: bool | None\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation: bool | None\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening: Literal['auto'] | bool | None\n\nuse_pose: bool | None\npose_split_on_token_ids: list[int] | None\npose_max_context_len: int | None\npose_num_chunks: int | None\n\npretrain_multipack_buffer_size: int | None = 10000\n# whether to prevent cross attention for packed sequences during pretraining\npretrain_multipack_attn: bool | None = True\n\n# Whether to use xformers attention patch https://github.com/facebookresearch/xformers\nxformers_attention: bool | None\n# Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/\n# torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention: bool | None\n# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention: bool | None\nflex_attention: bool | None\nflex_attn_compile_kwargs: dict[str, Any] | None\n# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention\nflash_attention: bool | None\n# Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_cross_entropy: bool | None\n# Whether to use flash-attention rms norm implementation - advanced use only\nflash_attn_rms_norm: bool | None\n# Whether to fuse QKV into a single operation\nflash_attn_fuse_qkv: bool | None\n# Whether to fuse part of the MLP into a single operation\nflash_attn_fuse_mlp: bool | None\n# Whether to use bettertransformers\nflash_optimum: bool | None\n\neager_attention: bool | None\n\nunsloth_cross_entropy_loss: bool | None\nunsloth_lora_mlp: bool | None\nunsloth_lora_qkv: bool | None\nunsloth_lora_o: bool | None\nunsloth_rms_norm: bool | None\nunsloth_rope: bool | None\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_mlp_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_qkv_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_o_kernel: bool | None\n\n# Whether to use chunked cross entropy loss for memory efficiency\nchunked_cross_entropy: bool | None\n# Number of chunks to use for chunked cross entropy loss\nchunked_cross_entropy_num_chunks: int | None\n\nllama4_linearized_experts: bool | None\n\n# Deepspeed config path. e.g., deepspeed_configs/zero3.json\ndeepspeed: str | dict[str, Any] | None\n# FSDP configuration\nfsdp: list[str] | None\n# FSDP configuration options\nfsdp_config: dict[str, Any] | None\nfsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for\n# no eval.\nval_set_size: float | None = 0.0\n\n# Set to a divisor of the number of GPUs available to split sequences into chunks of\n# equal size. 
Use in long context training to prevent OOM when sequences cannot fit into\n# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each\n# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized\n# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more\n# details.\nsequence_parallel_degree: int | None\n# Optional; strides across the key dimension. Larger values use more memory but should\n# make training faster. Must evenly divide the number of KV heads in your model.\nheads_k_stride: int | None\n# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to\n# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing\n# case.\nring_attn_func: RingAttnFunc | None\n\n# Add or change special tokens. If you add tokens here, you don't need to add them to\n# the `tokens` list.\nspecial_tokens: SpecialTokensConfig | None\n # For SpecialTokensConfig:\n bos_token: str | None\n eos_token: str | None\n pad_token: str | None\n unk_token: str | None\n additional_special_tokens: list[str] | None\n\n# Add extra tokens to the tokenizer\ntokens: list[str] | None\n# Mapping token_id to new_token_string to override reserved added_tokens in the\n# tokenizer. Only works for tokens that are not part of the base vocab (aka are\n# added_tokens). Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides: dict[int, str] | None\n\n# Whether to use torch.compile and which backend to use. Setting to `auto` will enable\n# torch compile when torch>=2.5.1\ntorch_compile: Literal['auto'] | bool | None\n# Backend to use for torch.compile\ntorch_compile_backend: str | None\ntorch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None\n\n# Maximum number of iterations to train for. It precedes num_epochs which means that if\n# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =>\n# `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps: int | None\n# Number of warmup steps. Cannot use with warmup_ratio\nwarmup_steps: int | None\n# Warmup ratio. Cannot use with warmup_steps\nwarmup_ratio: float | None\n# Leave empty to eval at each epoch, integer for every N steps. float for fraction of\n# total steps\neval_steps: int | float | None\n# Number of times per epoch to run evals, mutually exclusive with eval_steps\nevals_per_epoch: int | None\n# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer\n# from `eval_steps`\neval_strategy: str | None\n# Leave empty to save at each epoch, integer for every N steps. float for fraction of\n# total steps\nsave_steps: int | float | None\n# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsaves_per_epoch: int | None\n# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better\n# result is achieved, leave empty to infer from `save_steps`\nsave_strategy: str | None\n# Maximum number of checkpoints to keep at a time\nsave_total_limit: int | None\n# Logging frequency\nlogging_steps: int | None\n# Stop training after this many evaluation losses have increased in a row. See\n# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback\nearly_stopping_patience: int | None\nload_best_model_at_end: bool | None = False
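\n\n# Example (illustrative, not part of the schema): evaluate and checkpoint a few times\n# per epoch while keeping only the two most recent checkpoints:\n# evals_per_epoch: 4\n# saves_per_epoch: 4\n# save_total_limit: 2\n# logging_steps: 10\n\n# Save only the model weights, skipping the optimizer. 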
Using this means you can't resume\n# from checkpoints.\nsave_only_model: bool | None = False\n# Use tensorboard for logging\nuse_tensorboard: bool | None\n# Enable the pytorch profiler to capture the first N steps of training to the\n# output_dir. See https://pytorch.org/blog/understanding-gpu-memory-1/ for more\n# information. Snapshots can be visualized @ https://pytorch.org/memory_viz\nprofiler_steps: int | None\n# bool of whether to include tokens per second in the training metrics. This\n# iterates over the entire dataset once, so it takes some time.\ninclude_tokens_per_second: bool | None\n\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to\n# add noise to embeddings. Currently only supported on Llama and Mistral\nneftune_noise_alpha: float | None\n\n# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to\n# `beta` in `ORPOConfig` due to trl mapping.\norpo_alpha: float | None\n# Weighting of NLL term in loss from RPO paper\nrpo_alpha: float | None\n# Target reward margin for the SimPO loss\nsimpo_gamma: float | None\n# Weight of the BC regularizer\ncpo_alpha: float | None\n\n# Factor for desirable loss term in KTO loss\nkto_desirable_weight: float | None\n# Factor for undesirable loss term in KTO loss\nkto_undesirable_weight: float | None\n# The beta parameter for the RL training\nrl_beta: float | None\n\n# Defines the max memory usage per gpu on the system. Passed through to transformers\n# when loading the model.\nmax_memory: dict[int | Literal['cpu', 'disk'], int | str] | None\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in\n# gigabytes); default: unset\ngpu_memory_limit: int | str | None\n# Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage: bool | None\n\n# The name of the chat template to use for training, following values are supported:\n# tokenizer_default: Uses the chat template that is available in the\n# tokenizer_config.json. If the chat template is not available in the tokenizer, it will\n# raise an error. This is the default value.\n# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n# tokenizer_default_fallback_*: where * is the name of the chat template to fall back to.\n# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not\n# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.\n# The custom jinja template should be provided in the chat_template_jinja field. The\n# selected chat template will be saved to the tokenizer_config.json for easier\n# inferencing\nchat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None\n# Custom jinja template for chat template. This will only be used if chat_template is\n# set to `jinja` or `null` (in which case chat_template is automatically set to\n# `jinja`). Default is null.\nchat_template_jinja: str | None\n# Additional kwargs to pass to the chat template. This is useful for customizing the\n# chat template. For example, you can pass `thinking=False` to add a generation prompt\n# to the chat template.\nchat_template_kwargs: dict[str, Any] | None
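\n\n# Example (illustrative, not part of the schema): selecting a built-in template at the\n# top level:\n# chat_template: chatml\n# or passing kwargs through to the template as described above:\n# chat_template_kwargs:\n#   thinking: false\n\n# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the\n# boundaries between conversation turns. For example: ['/INST', '</s>',\n# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. 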
This is\n# useful for templates that use multiple delimiter tokens.\neot_tokens: list[str] | None\n# Changes the default system message. Currently only supports chatml.\ndefault_system_message: str | None\n\nfix_untrained_tokens: int | list[int] | None\n\nis_preprocess: bool | None\npreprocess_iterable: bool | None\n\n# Total number of tokens - internal use\ntotal_num_tokens: int | None\ntotal_supervised_tokens: int | None\n# You can set these packing optimizations AFTER starting a training at least once. The\n# trainer will provide recommended values for these settings.\nsample_packing_eff_est: float | None\naxolotl_config_path: str | None\n\n# Internal use only - Used to identify what the model is based on\nis_falcon_derived_model: bool | None\n# Internal use only - Used to identify what the model is based on\nis_llama_derived_model: bool | None\n# Internal use only - Used to identify what the model is based on. Please note that if\n# you set this to true, `padding_side` will be set to 'left' by default\nis_mistral_derived_model: bool | None\n# Internal use only - Used to identify what the model is based on\nis_qwen_derived_model: bool | None\n\n# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available\n# plugins or doc below for more details.\n# https://docs.axolotl.ai/docs/custom_integrations.html\nplugins: list[str] | None\n\n# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This\n# can also be a relative path to a model on disk\nbase_model: str (required)\n# If the base_model repo on hf hub doesn't include configuration .json files, you can\n# set that here, or leave this empty to default to base_model\nbase_model_config: str | None\ncls_model_config: str | None\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one defined in the base model\ntokenizer_config: str | None\n# use_fast option for tokenizer loading from_pretrained, defaults to True\ntokenizer_use_fast: bool | None\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy: bool | None\n# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-\n# common tokenizer.\ntokenizer_use_mistral_common: bool | None\n# Corresponding tokenizer for the model; AutoTokenizer is a good choice\ntokenizer_type: str | None\n# transformers processor class\nprocessor_type: str | None\n# Trust remote code for untrusted source\ntrust_remote_code: bool | None\n\n# Where to save the full-finetuned model to\noutput_dir: str = ./model-out\n# push checkpoints to hub\nhub_model_id: str | None\n# how to push checkpoints to hub\nhub_strategy: str | None\n# Save model as safetensors (require safetensors package). Default True\nsave_safetensors: bool | None = True\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: bool | None = False\n# Use bitsandbytes 4 bit\nload_in_4bit: bool | None = False\n\n# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in\n# original model\nadapter: str | None
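\n\n# Example (illustrative, not part of the schema): a minimal QLoRA-style setup; the model\n# id is a placeholder:\n# base_model: your-org/your-base-model\n# load_in_4bit: true\n# adapter: qlora\n# output_dir: ./model-out\n\n# If you already have a lora model trained that you want to load, put that here. This\n# means after training, if you want to test the model, you should set this to the value\n# of `output_dir`. 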
Note that if you merge an adapter to the base model, a new\n# subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir: str | None\nlora_r: int | None\nlora_alpha: int | None\nlora_fan_in_fan_out: bool | None\nlora_target_modules: str | list[str] | None\n# If true, will target all linear modules\nlora_target_linear: bool | None\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules\n# because they need to know the new tokens. For LLaMA and Mistral, you need to save\n# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts\n# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\nlora_modules_to_save: list[str] | None\nlora_dropout: float | None = 0.0\n# The layer indices to transform; otherwise, apply to all layers\npeft_layers_to_transform: list[int] | None\npeft_layers_pattern: list[str] | None\n\npeft: PeftConfig | None\n # For PeftConfig:\n # Configuration options for loftq initialization for LoRA\n loftq_config: LoftQConfig | None\n # For LoftQConfig:\n # typically 4 bits\n loftq_bits: int = 4\n\n# Whether to use DoRA.\npeft_use_dora: bool | None\n# Whether to use RSLoRA.\npeft_use_rslora: bool | None\n# List of layer indices to replicate.\npeft_layer_replication: list[tuple[int, int]] | None\n# How to initialize LoRA weights. Defaults to True, which is the MS original implementation.\npeft_init_lora_weights: bool | str | None\n\n# load qlora model in sharded format for FSDP using answer.ai technique.\nqlora_sharded_model_loading: bool | None = False\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it\n# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: bool | None\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: bool | None\n# optional overrides to the bnb 4bit quantization configuration\nbnb_config_kwargs: dict[str, Any] | None\n\n# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_ratio: float | None\n# loraplus learning rate for lora embedding layers. Default value is 1e-6.\nloraplus_lr_embedding: float | None = 1e-06\n\nmerge_lora: bool | None\n\n# Number of steps per ReLoRA restart\nrelora_steps: int | None\n# Number of per-restart warmup steps\nrelora_warmup_steps: int | None\n# Number of anneal steps for each relora cycle\nrelora_anneal_steps: int | None\n# threshold for optimizer magnitude when pruning\nrelora_prune_ratio: float | None\n# True to perform lora weight merges on cpu during restarts, for modest gpu memory\n# savings\nrelora_cpu_offload: bool | None\n\n# If greater than 1, backpropagation will be skipped and the gradients will be\n# accumulated for the given number of steps.\ngradient_accumulation_steps: int | None = 1\n# The number of samples to include in each batch. This is the number of samples sent to\n# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: int | None = 1\n# Total batch size; we do not recommend setting this manually\nbatch_size: int | None\n# per gpu micro batch size for evals, defaults to value of micro_batch_size\neval_batch_size: int | None\n\n# whether to find batch size that fits in memory. Passed to underlying transformers\n# Trainer\nauto_find_batch_size: bool | None\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: bool | None = False
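\n\n# Example (illustrative, not part of the schema): with micro_batch_size: 2,\n# gradient_accumulation_steps: 8 and 4 GPUs, each GPU processes 2 * 8 = 16 samples per\n# optimizer step, for an effective global batch size of 2 * 8 * 4 = 64.\n# micro_batch_size: 2\n# gradient_accumulation_steps: 8\n\n# Group similarly sized data to minimize padding. 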
May be slower to start, as it must\n# download and sort the entire dataset. Note that training loss may have an oscillating\n# pattern with this enabled.\ngroup_by_length: bool | None\n\nlearning_rate: str | float (required)\nembedding_lr: float | None\nembedding_lr_scale: float | None\n# Specify weight decay\nweight_decay: float | None = 0.0\n# Specify optimizer\noptimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED\n# Dictionary of arguments to pass to the optimizer\noptim_args: str | dict[str, Any] | None\n# The target modules to optimize, i.e. the module names that you would like to train,\n# right now this is used only for GaLore algorithm\noptim_target_modules: list[str] | Literal['all_linear'] | None\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path: str | None\nlr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE\n# Specify a scheduler and kwargs to use with the optimizer\nlr_scheduler_kwargs: dict[str, Any] | None\nlr_quadratic_warmup: bool | None\n# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of\n# peak lr\ncosine_min_lr_ratio: float | None\n# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means\n# start cosine_min_lr at 80% of training step\ncosine_constant_lr_ratio: float | None\n# Learning rate div factor\nlr_div_factor: float | None\n\nlr_groups: list[LrGroup] | None\n # For LrGroup:\n name: str (required)\n modules: list[str] (required)\n lr: float (required)\n\n# adamw hyperparams\nadam_epsilon: float | None\n# only used for CAME Optimizer\nadam_epsilon2: float | None\n# adamw hyperparams\nadam_beta1: float | None\n# adamw hyperparams\nadam_beta2: float | None\n# only used for CAME Optimizer\nadam_beta3: float | None\n# Gradient clipping max norm\nmax_grad_norm: float | None\nnum_epochs: float = 1.0\n\nuse_wandb: bool | None\n# Set the name of your wandb run\nwandb_name: str | None\n# Set the ID of your wandb run\nwandb_run_id: str | None\n# \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn\n# off wandb\nwandb_mode: str | None\n# Your wandb project name\nwandb_project: str | None\n# A wandb Team name if using a Team\nwandb_entity: str | None\nwandb_watch: str | None\n# \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only\n# at the end of training\nwandb_log_model: str | None\n\nuse_mlflow: bool | None\n# URI to mlflow\nmlflow_tracking_uri: str | None\n# Your experiment name\nmlflow_experiment_name: str | None\n# Your run name\nmlflow_run_name: str | None\n# set to true to copy each saved checkpoint on each save to mlflow artifact registry\nhf_mlflow_log_artifacts: bool | None\n\n# Enable or disable Comet integration.\nuse_comet: bool | None\n# API key for Comet. Recommended to set via `comet login`.\ncomet_api_key: str | None\n# Workspace name in Comet. Defaults to the user's default workspace.\ncomet_workspace: str | None\n# Project name in Comet. Defaults to Uncategorized.\ncomet_project_name: str | None\n# Identifier for the experiment. Used to append data to an existing experiment or\n# control the key of new experiments. Default to a random key.\ncomet_experiment_key: str | None\n# Create a new experiment (\"create\") or log to an existing one (\"get\"). Default\n# (\"get_or_create\") auto-selects based on configuration.\ncomet_mode: str | None\n# Set to True to log data to Comet server, or False for offline storage. 
Default is\n# True.\ncomet_online: bool | None\n# Dictionary for additional configuration settings, see the doc for more details.\ncomet_experiment_config: dict[str, Any] | None\n\n# the number of active layers in LISA\nlisa_n_layers: int | None\n# how often to switch layers in LISA\nlisa_step_interval: int | None\n# path under the model to access the layers\nlisa_layers_attribute: str | None = model.layers\n\ngradio_title: str | None\ngradio_share: bool | None\ngradio_server_name: str | None\ngradio_server_port: int | None\ngradio_max_new_tokens: int | None\ngradio_temperature: float | None\n\nuse_ray: bool = False\nray_run_name: str | None\nray_num_workers: int = 1\nresources_per_worker: dict\n\n# The size of the image to resize to. It can be an integer (resized into padded-square\n# image) or a tuple (width, height). If not provided, we will attempt to load from\n# preprocessor.size, otherwise, images won't be resized.\nimage_size: int | tuple[int, int] | None\n# The resampling algorithm to use for image resizing. Default is bilinear. Please refer\n# to PIL.Image.Resampling for more details.\nimage_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None\n\n# optional overrides to the base model configuration\noverrides_of_model_config: dict[str, Any] | None\n# optional overrides to the base model loading from_pretrained\noverrides_of_model_kwargs: dict[str, Any] | None\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good\n# choice too\ntype_of_model: str | None\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model: str | None\n\nmax_packed_sequence_len: int | None\nrope_scaling: Any | None\nnoisy_embedding_alpha: float | None\ndpo_beta: float | None\nevaluation_strategy: str | None", "crumbs": [ "Getting Started", "Config Reference" diff --git a/sitemap.xml index a582c5120..c338508df 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,758 +2,758 @@ https://docs.axolotl.ai/docs/unsloth.html - 2025-07-02T07:04:01.016Z + 2025-07-02T12:06:08.387Z https://docs.axolotl.ai/docs/dataset-formats/conversation.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/dataset-formats/stepwise_supervised.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/dataset-formats/tokenized.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/mac.html - 2025-07-02T07:04:01.015Z + 2025-07-02T12:06:08.386Z https://docs.axolotl.ai/docs/nccl.html - 2025-07-02T07:04:01.015Z + 2025-07-02T12:06:08.387Z https://docs.axolotl.ai/docs/multi-node.html - 2025-07-02T07:04:01.015Z + 2025-07-02T12:06:08.387Z https://docs.axolotl.ai/docs/docker.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/lr_groups.html - 2025-07-02T07:04:01.015Z + 2025-07-02T12:06:08.386Z https://docs.axolotl.ai/docs/inference.html - 2025-07-02T07:04:01.015Z + 2025-07-02T12:06:08.386Z https://docs.axolotl.ai/docs/cli.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/config-reference.html - 2025-07-02T07:07:38.121Z + 2025-07-02T12:09:33.017Z https://docs.axolotl.ai/docs/multi-gpu.html - 2025-07-02T07:04:01.015Z + 2025-07-02T12:06:08.386Z https://docs.axolotl.ai/docs/debugging.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/multimodal.html - 2025-07-02T07:04:01.015Z + 2025-07-02T12:06:08.387Z 
https://docs.axolotl.ai/docs/api/cli.sweeps.html - 2025-07-02T07:07:24.426Z + 2025-07-02T12:09:19.799Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.llama3.html - 2025-07-02T07:07:24.748Z + 2025-07-02T12:09:20.124Z https://docs.axolotl.ai/docs/api/utils.schedulers.html - 2025-07-02T07:07:25.158Z + 2025-07-02T12:09:20.518Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_xformers.html - 2025-07-02T07:07:24.945Z + 2025-07-02T12:09:20.322Z https://docs.axolotl.ai/docs/api/cli.cloud.modal_.html - 2025-07-02T07:07:24.474Z + 2025-07-02T12:09:19.847Z https://docs.axolotl.ai/docs/api/kernels.geglu.html - 2025-07-02T07:07:24.899Z + 2025-07-02T12:09:20.275Z https://docs.axolotl.ai/docs/api/core.trainers.utils.html - 2025-07-02T07:07:24.545Z + 2025-07-02T12:09:19.919Z https://docs.axolotl.ai/docs/api/core.datasets.chat.html - 2025-07-02T07:07:24.279Z + 2025-07-02T12:09:19.650Z https://docs.axolotl.ai/docs/api/utils.schemas.peft.html - 2025-07-02T07:07:25.297Z + 2025-07-02T12:09:20.629Z https://docs.axolotl.ai/docs/api/monkeypatch.btlm_attn_hijack_flash.html - 2025-07-02T07:07:25.008Z + 2025-07-02T12:09:20.386Z https://docs.axolotl.ai/docs/api/prompt_strategies.chat_template.html - 2025-07-02T07:07:24.652Z + 2025-07-02T12:09:20.027Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.user_defined.html - 2025-07-02T07:07:24.780Z + 2025-07-02T12:09:20.157Z https://docs.axolotl.ai/docs/api/cli.cloud.base.html - 2025-07-02T07:07:24.467Z + 2025-07-02T12:09:19.841Z https://docs.axolotl.ai/docs/api/kernels.swiglu.html - 2025-07-02T07:07:24.909Z + 2025-07-02T12:09:20.286Z https://docs.axolotl.ai/docs/api/prompt_strategies.stepwise_supervised.html - 2025-07-02T07:07:24.715Z + 2025-07-02T12:09:20.091Z https://docs.axolotl.ai/docs/api/prompt_strategies.bradley_terry.llama3.html - 2025-07-02T07:07:24.804Z + 2025-07-02T12:09:20.181Z https://docs.axolotl.ai/docs/api/prompt_strategies.completion.html - 2025-07-02T07:07:24.705Z + 2025-07-02T12:09:20.081Z https://docs.axolotl.ai/docs/api/kernels.utils.html - 2025-07-02T07:07:24.918Z + 2025-07-02T12:09:20.295Z https://docs.axolotl.ai/docs/api/common.datasets.html - 2025-07-02T07:07:25.507Z + 2025-07-02T12:09:20.842Z https://docs.axolotl.ai/docs/api/utils.schemas.datasets.html - 2025-07-02T07:07:25.288Z + 2025-07-02T12:09:20.621Z https://docs.axolotl.ai/docs/api/core.builders.rl.html - 2025-07-02T07:07:24.234Z + 2025-07-02T12:09:19.605Z https://docs.axolotl.ai/docs/api/evaluate.html - 2025-07-02T07:07:24.142Z + 2025-07-02T12:09:19.510Z https://docs.axolotl.ai/docs/api/kernels.quantize.html - 2025-07-02T07:07:24.917Z + 2025-07-02T12:09:20.293Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_flash.html - 2025-07-02T07:07:24.944Z + 2025-07-02T12:09:20.320Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.rng_state_loader.html - 2025-07-02T07:07:24.588Z + 2025-07-02T12:09:19.963Z https://docs.axolotl.ai/docs/api/integrations.base.html - 2025-07-02T07:07:25.468Z + 2025-07-02T12:09:20.802Z https://docs.axolotl.ai/docs/api/cli.merge_lora.html - 2025-07-02T07:07:24.400Z + 2025-07-02T12:09:19.773Z https://docs.axolotl.ai/docs/api/cli.merge_sharded_fsdp_weights.html - 2025-07-02T07:07:24.412Z + 2025-07-02T12:09:19.785Z https://docs.axolotl.ai/docs/api/monkeypatch.transformers_fa_utils.html - 2025-07-02T07:07:25.025Z + 2025-07-02T12:09:20.403Z https://docs.axolotl.ai/docs/api/prompt_strategies.llama2_chat.html - 2025-07-02T07:07:24.699Z + 2025-07-02T12:09:20.075Z https://docs.axolotl.ai/docs/api/utils.collators.mm_chat.html - 
2025-07-02T07:07:25.538Z + 2025-07-02T12:09:20.871Z https://docs.axolotl.ai/docs/api/utils.data.sft.html - 2025-07-02T07:07:25.197Z + 2025-07-02T12:09:20.557Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_instruct.html - 2025-07-02T07:07:24.667Z + 2025-07-02T12:09:20.042Z https://docs.axolotl.ai/docs/api/integrations.liger.args.html - 2025-07-02T07:07:25.483Z + 2025-07-02T12:09:20.817Z https://docs.axolotl.ai/docs/api/monkeypatch.mistral_attn_hijack_flash.html - 2025-07-02T07:07:24.960Z + 2025-07-02T12:09:20.338Z https://docs.axolotl.ai/docs/api/cli.vllm_serve.html - 2025-07-02T07:07:24.464Z + 2025-07-02T12:09:19.838Z https://docs.axolotl.ai/docs/api/monkeypatch.utils.html - 2025-07-02T07:07:25.006Z + 2025-07-02T12:09:20.384Z https://docs.axolotl.ai/docs/api/loaders.patch_manager.html - 2025-07-02T07:07:24.578Z + 2025-07-02T12:09:19.953Z https://docs.axolotl.ai/docs/api/utils.schemas.integrations.html - 2025-07-02T07:07:25.318Z + 2025-07-02T12:09:20.650Z https://docs.axolotl.ai/docs/api/utils.callbacks.perplexity.html - 2025-07-02T07:07:25.585Z + 2025-07-02T12:09:20.918Z https://docs.axolotl.ai/docs/api/cli.utils.html - 2025-07-02T07:07:24.457Z + 2025-07-02T12:09:19.831Z https://docs.axolotl.ai/docs/api/utils.schemas.config.html - 2025-07-02T07:07:25.249Z + 2025-07-02T12:09:20.590Z https://docs.axolotl.ai/docs/api/prompt_strategies.input_output.html - 2025-07-02T07:07:24.711Z + 2025-07-02T12:09:20.087Z https://docs.axolotl.ai/docs/api/utils.distributed.html - 2025-07-02T07:07:25.178Z + 2025-07-02T12:09:20.537Z https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html - 2025-07-02T07:07:25.069Z + 2025-07-02T12:09:20.436Z https://docs.axolotl.ai/docs/api/monkeypatch.trainer_fsdp_optim.html - 2025-07-02T07:07:25.018Z + 2025-07-02T12:09:20.396Z https://docs.axolotl.ai/docs/api/core.builders.base.html - 2025-07-02T07:07:24.222Z + 2025-07-02T12:09:19.593Z https://docs.axolotl.ai/docs/api/core.trainers.trl.html - 2025-07-02T07:07:24.505Z + 2025-07-02T12:09:19.879Z https://docs.axolotl.ai/docs/api/cli.evaluate.html - 2025-07-02T07:07:24.334Z + 2025-07-02T12:09:19.706Z https://docs.axolotl.ai/docs/api/utils.optimizers.adopt.html - 2025-07-02T07:07:25.189Z + 2025-07-02T12:09:20.548Z https://docs.axolotl.ai/docs/api/utils.callbacks.qat.html - 2025-07-02T07:07:25.604Z + 2025-07-02T12:09:20.937Z https://docs.axolotl.ai/docs/api/core.trainers.dpo.trainer.html - 2025-07-02T07:07:24.521Z + 2025-07-02T12:09:19.895Z https://docs.axolotl.ai/docs/api/core.chat.format.shared.html - 2025-07-02T07:07:24.274Z + 2025-07-02T12:09:19.645Z https://docs.axolotl.ai/docs/api/monkeypatch.relora.html - 2025-07-02T07:07:24.969Z + 2025-07-02T12:09:20.346Z https://docs.axolotl.ai/docs/api/cli.config.html - 2025-07-02T07:07:24.378Z + 2025-07-02T12:09:19.750Z https://docs.axolotl.ai/docs/api/cli.preprocess.html - 2025-07-02T07:07:24.420Z + 2025-07-02T12:09:19.793Z https://docs.axolotl.ai/docs/api/core.trainers.base.html - 2025-07-02T07:07:24.489Z + 2025-07-02T12:09:19.863Z https://docs.axolotl.ai/docs/api/convert.html - 2025-07-02T07:07:24.165Z + 2025-07-02T12:09:19.534Z https://docs.axolotl.ai/docs/api/prompt_strategies.pygmalion.html - 2025-07-02T07:07:24.732Z + 2025-07-02T12:09:20.108Z https://docs.axolotl.ai/docs/api/utils.schemas.trl.html - 2025-07-02T07:07:25.300Z + 2025-07-02T12:09:20.633Z https://docs.axolotl.ai/docs/api/cli.args.html - 2025-07-02T07:07:24.354Z + 2025-07-02T12:09:19.726Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chat_template.html - 
2025-07-02T07:07:24.738Z + 2025-07-02T12:09:20.114Z https://docs.axolotl.ai/docs/api/loaders.constants.html - 2025-07-02T07:07:24.580Z + 2025-07-02T12:09:19.954Z https://docs.axolotl.ai/docs/api/logging_config.html - 2025-07-02T07:07:24.216Z + 2025-07-02T12:09:19.586Z https://docs.axolotl.ai/docs/api/cli.inference.html - 2025-07-02T07:07:24.392Z + 2025-07-02T12:09:19.764Z https://docs.axolotl.ai/docs/api/utils.ctx_managers.sequence_parallel.html - 2025-07-02T07:07:24.618Z + 2025-07-02T12:09:19.993Z https://docs.axolotl.ai/docs/api/integrations.spectrum.args.html - 2025-07-02T07:07:25.489Z + 2025-07-02T12:09:20.824Z https://docs.axolotl.ai/docs/api/utils.schemas.training.html - 2025-07-02T07:07:25.271Z + 2025-07-02T12:09:20.603Z https://docs.axolotl.ai/docs/api/prompt_strategies.orcamini.html - 2025-07-02T07:07:24.726Z + 2025-07-02T12:09:20.102Z https://docs.axolotl.ai/docs/api/utils.freeze.html - 2025-07-02T07:07:25.115Z + 2025-07-02T12:09:20.476Z https://docs.axolotl.ai/docs/api/loaders.tokenizer.html - 2025-07-02T07:07:24.563Z + 2025-07-02T12:09:19.937Z https://docs.axolotl.ai/docs/api/utils.bench.html - 2025-07-02T07:07:25.107Z + 2025-07-02T12:09:20.469Z https://docs.axolotl.ai/docs/api/utils.quantization.html - 2025-07-02T07:07:25.218Z + 2025-07-02T12:09:20.577Z https://docs.axolotl.ai/docs/batch_vs_grad.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/input_output.html - 2025-07-02T07:04:01.015Z + 2025-07-02T12:06:08.386Z https://docs.axolotl.ai/docs/sequence_parallelism.html - 2025-07-02T07:04:01.016Z + 2025-07-02T12:06:08.387Z https://docs.axolotl.ai/docs/reward_modelling.html - 2025-07-02T07:04:01.016Z + 2025-07-02T12:06:08.387Z https://docs.axolotl.ai/index.html - 2025-07-02T07:04:01.030Z + 2025-07-02T12:06:08.402Z https://docs.axolotl.ai/src/axolotl/integrations/LICENSE.html - 2025-07-02T07:04:01.034Z + 2025-07-02T12:06:08.405Z https://docs.axolotl.ai/FAQS.html - 2025-07-02T07:04:01.010Z + 2025-07-02T12:06:08.381Z https://docs.axolotl.ai/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html - 2025-07-02T07:04:01.035Z + 2025-07-02T12:06:08.406Z https://docs.axolotl.ai/TODO.html - 2025-07-02T07:04:01.010Z + 2025-07-02T12:06:08.381Z https://docs.axolotl.ai/examples/colab-notebooks/colab-axolotl-example.html - 2025-07-02T07:04:01.018Z + 2025-07-02T12:06:08.389Z https://docs.axolotl.ai/docs/torchao.html - 2025-07-02T07:04:01.016Z + 2025-07-02T12:06:08.387Z https://docs.axolotl.ai/docs/ray-integration.html - 2025-07-02T07:04:01.016Z + 2025-07-02T12:06:08.387Z https://docs.axolotl.ai/docs/quantize.html - 2025-07-02T07:04:01.015Z + 2025-07-02T12:06:08.387Z https://docs.axolotl.ai/docs/qat.html - 2025-07-02T07:04:01.015Z + 2025-07-02T12:06:08.387Z https://docs.axolotl.ai/docs/api/utils.lora.html - 2025-07-02T07:07:25.098Z + 2025-07-02T12:09:20.460Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_w_system.html - 2025-07-02T07:07:24.679Z + 2025-07-02T12:09:20.054Z https://docs.axolotl.ai/docs/api/monkeypatch.stablelm_attn_hijack_flash.html - 2025-07-02T07:07:25.015Z + 2025-07-02T12:09:20.393Z https://docs.axolotl.ai/docs/api/utils.collators.core.html - 2025-07-02T07:07:25.510Z + 2025-07-02T12:09:20.844Z https://docs.axolotl.ai/docs/api/prompt_strategies.metharme.html - 2025-07-02T07:07:24.722Z + 2025-07-02T12:09:20.098Z https://docs.axolotl.ai/docs/api/utils.callbacks.profiler.html - 2025-07-02T07:07:25.588Z + 2025-07-02T12:09:20.922Z https://docs.axolotl.ai/docs/api/utils.data.pretraining.html - 2025-07-02T07:07:25.190Z + 
2025-07-02T12:09:20.550Z https://docs.axolotl.ai/docs/api/utils.callbacks.lisa.html - 2025-07-02T07:07:25.590Z + 2025-07-02T12:09:20.923Z https://docs.axolotl.ai/docs/api/utils.trainer.html - 2025-07-02T07:07:25.133Z + 2025-07-02T12:09:20.493Z https://docs.axolotl.ai/docs/api/integrations.cut_cross_entropy.args.html - 2025-07-02T07:07:25.471Z + 2025-07-02T12:09:20.805Z https://docs.axolotl.ai/docs/api/utils.schemas.model.html - 2025-07-02T07:07:25.263Z + 2025-07-02T12:09:20.597Z https://docs.axolotl.ai/docs/api/monkeypatch.data.batch_dataset_fetcher.html - 2025-07-02T07:07:25.028Z + 2025-07-02T12:09:20.406Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.zephyr.html - 2025-07-02T07:07:24.760Z + 2025-07-02T12:09:20.136Z https://docs.axolotl.ai/docs/api/datasets.html - 2025-07-02T07:07:24.153Z + 2025-07-02T12:09:19.521Z https://docs.axolotl.ai/docs/api/utils.schemas.enums.html - 2025-07-02T07:07:25.328Z + 2025-07-02T12:09:20.661Z https://docs.axolotl.ai/docs/api/integrations.kd.trainer.html - 2025-07-02T07:07:25.479Z + 2025-07-02T12:09:20.814Z https://docs.axolotl.ai/docs/api/monkeypatch.lora_kernels.html - 2025-07-02T07:07:24.999Z + 2025-07-02T12:09:20.376Z https://docs.axolotl.ai/docs/api/utils.collators.batching.html - 2025-07-02T07:07:25.528Z + 2025-07-02T12:09:20.863Z https://docs.axolotl.ai/docs/api/core.trainers.grpo.sampler.html - 2025-07-02T07:07:24.543Z + 2025-07-02T12:09:19.918Z https://docs.axolotl.ai/docs/api/prompt_strategies.base.html - 2025-07-02T07:07:24.620Z + 2025-07-02T12:09:19.995Z https://docs.axolotl.ai/docs/api/monkeypatch.multipack.html - 2025-07-02T07:07:24.962Z + 2025-07-02T12:09:20.339Z https://docs.axolotl.ai/docs/api/prompt_strategies.orpo.chat_template.html - 2025-07-02T07:07:24.800Z + 2025-07-02T12:09:20.177Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.passthrough.html - 2025-07-02T07:07:24.763Z + 2025-07-02T12:09:20.139Z https://docs.axolotl.ai/docs/api/core.chat.format.chatml.html - 2025-07-02T07:07:24.271Z + 2025-07-02T12:09:19.642Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.scheduler.html - 2025-07-02T07:07:24.595Z + 2025-07-02T12:09:19.970Z https://docs.axolotl.ai/docs/api/utils.model_shard_quant.html - 2025-07-02T07:07:25.104Z + 2025-07-02T12:09:20.465Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.chatml.html - 2025-07-02T07:07:24.779Z + 2025-07-02T12:09:20.155Z https://docs.axolotl.ai/docs/api/utils.tokenization.html - 2025-07-02T07:07:25.079Z + 2025-07-02T12:09:20.445Z https://docs.axolotl.ai/docs/api/loaders.model.html - 2025-07-02T07:07:24.555Z + 2025-07-02T12:09:19.929Z https://docs.axolotl.ai/docs/api/utils.callbacks.mlflow_.html - 2025-07-02T07:07:25.593Z + 2025-07-02T12:09:20.927Z https://docs.axolotl.ai/docs/api/core.trainers.grpo.trainer.html - 2025-07-02T07:07:24.531Z + 2025-07-02T12:09:19.906Z https://docs.axolotl.ai/docs/api/cli.main.html - 2025-07-02T07:07:24.318Z + 2025-07-02T12:09:19.689Z https://docs.axolotl.ai/docs/api/utils.callbacks.comet_.html - 2025-07-02T07:07:25.597Z + 2025-07-02T12:09:20.930Z https://docs.axolotl.ai/docs/api/utils.chat_templates.html - 2025-07-02T07:07:25.094Z + 2025-07-02T12:09:20.455Z https://docs.axolotl.ai/docs/api/utils.schemas.utils.html - 2025-07-02T07:07:25.334Z + 2025-07-02T12:09:20.666Z https://docs.axolotl.ai/docs/api/common.architectures.html - 2025-07-02T07:07:25.491Z + 2025-07-02T12:09:20.825Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_expand_mask.html - 2025-07-02T07:07:24.970Z + 2025-07-02T12:09:20.347Z 
https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_chat.html - 2025-07-02T07:07:24.665Z + 2025-07-02T12:09:20.041Z https://docs.axolotl.ai/docs/api/utils.samplers.multipack.html - 2025-07-02T07:07:25.578Z + 2025-07-02T12:09:20.912Z https://docs.axolotl.ai/docs/api/integrations.grokfast.optimizer.html - 2025-07-02T07:07:25.472Z + 2025-07-02T12:09:20.807Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chatml.html - 2025-07-02T07:07:24.758Z + 2025-07-02T12:09:20.135Z https://docs.axolotl.ai/docs/api/monkeypatch.mixtral.html - 2025-07-02T07:07:25.029Z + 2025-07-02T12:09:20.407Z https://docs.axolotl.ai/docs/api/train.html - 2025-07-02T07:07:24.131Z + 2025-07-02T12:09:19.499Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_patch_multipack.html - 2025-07-02T07:07:25.009Z + 2025-07-02T12:09:20.387Z https://docs.axolotl.ai/docs/api/index.html - 2025-07-02T07:07:24.070Z + 2025-07-02T12:09:19.437Z https://docs.axolotl.ai/docs/api/loaders.adapter.html - 2025-07-02T07:07:24.570Z + 2025-07-02T12:09:19.945Z https://docs.axolotl.ai/docs/api/utils.schemas.multimodal.html - 2025-07-02T07:07:25.306Z + 2025-07-02T12:09:20.638Z https://docs.axolotl.ai/docs/api/kernels.lora.html - 2025-07-02T07:07:24.889Z + 2025-07-02T12:09:20.265Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.llama3.html - 2025-07-02T07:07:24.771Z + 2025-07-02T12:09:20.147Z https://docs.axolotl.ai/docs/api/cli.checks.html - 2025-07-02T07:07:24.360Z + 2025-07-02T12:09:19.732Z https://docs.axolotl.ai/docs/api/cli.quantize.html - 2025-07-02T07:07:24.478Z + 2025-07-02T12:09:19.852Z https://docs.axolotl.ai/docs/api/integrations.lm_eval.args.html - 2025-07-02T07:07:25.486Z + 2025-07-02T12:09:20.820Z https://docs.axolotl.ai/docs/api/core.chat.messages.html - 2025-07-02T07:07:24.270Z + 2025-07-02T12:09:19.641Z https://docs.axolotl.ai/docs/api/core.builders.causal.html - 2025-07-02T07:07:24.226Z + 2025-07-02T12:09:19.597Z https://docs.axolotl.ai/docs/api/core.trainers.relora.html - 2025-07-02T07:07:24.514Z + 2025-07-02T12:09:19.889Z https://docs.axolotl.ai/docs/api/models.mamba.modeling_mamba.html - 2025-07-02T07:07:25.508Z + 2025-07-02T12:09:20.843Z https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html - 2025-07-02T07:07:25.033Z + 2025-07-02T12:09:20.410Z https://docs.axolotl.ai/docs/api/core.trainers.mamba.html - 2025-07-02T07:07:24.510Z + 2025-07-02T12:09:19.884Z https://docs.axolotl.ai/docs/api/core.datasets.transforms.chat_builder.html - 2025-07-02T07:07:24.287Z + 2025-07-02T12:09:19.658Z https://docs.axolotl.ai/docs/api/loaders.processor.html - 2025-07-02T07:07:24.565Z + 2025-07-02T12:09:19.939Z https://docs.axolotl.ai/docs/api/core.chat.format.llama3x.html - 2025-07-02T07:07:24.273Z + 2025-07-02T12:09:19.644Z https://docs.axolotl.ai/docs/api/prompt_strategies.messages.chat.html - 2025-07-02T07:07:24.736Z + 2025-07-02T12:09:20.112Z https://docs.axolotl.ai/docs/api/cli.train.html - 2025-07-02T07:07:24.326Z + 2025-07-02T12:09:19.698Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.optimizer.html - 2025-07-02T07:07:24.585Z + 2025-07-02T12:09:19.960Z https://docs.axolotl.ai/docs/api/utils.collators.mamba.html - 2025-07-02T07:07:25.533Z + 2025-07-02T12:09:20.866Z https://docs.axolotl.ai/docs/api/monkeypatch.unsloth_.html - 2025-07-02T07:07:25.026Z + 2025-07-02T12:09:20.404Z https://docs.axolotl.ai/docs/api/utils.dict.html - 2025-07-02T07:07:25.181Z + 2025-07-02T12:09:20.541Z https://docs.axolotl.ai/docs/api/prompt_strategies.user_defined.html - 2025-07-02T07:07:24.686Z + 
2025-07-02T12:09:20.062Z https://docs.axolotl.ai/docs/api/core.training_args.html - 2025-07-02T07:07:24.247Z + 2025-07-02T12:09:19.618Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.user_defined.html - 2025-07-02T07:07:24.761Z + 2025-07-02T12:09:20.138Z https://docs.axolotl.ai/docs/api/prompt_tokenizers.html - 2025-07-02T07:07:24.206Z + 2025-07-02T12:09:19.577Z https://docs.axolotl.ai/docs/api/common.const.html - 2025-07-02T07:07:25.492Z + 2025-07-02T12:09:20.827Z https://docs.axolotl.ai/docs/fsdp_qlora.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.384Z https://docs.axolotl.ai/docs/custom_integrations.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/getting-started.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.384Z https://docs.axolotl.ai/docs/faq.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.384Z https://docs.axolotl.ai/docs/lora_optims.html - 2025-07-02T07:04:01.015Z + 2025-07-02T12:06:08.386Z https://docs.axolotl.ai/docs/rlhf.html - 2025-07-02T07:04:01.016Z + 2025-07-02T12:06:08.387Z https://docs.axolotl.ai/docs/amd_hpc.html - 2025-07-02T07:04:01.011Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/installation.html - 2025-07-02T07:04:01.015Z + 2025-07-02T12:06:08.386Z https://docs.axolotl.ai/docs/multipack.html - 2025-07-02T07:04:01.015Z + 2025-07-02T12:06:08.387Z https://docs.axolotl.ai/docs/dataset_preprocessing.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/dataset_loading.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/dataset-formats/inst_tune.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/dataset-formats/template_free.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/dataset-formats/index.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z https://docs.axolotl.ai/docs/dataset-formats/pretraining.html - 2025-07-02T07:04:01.012Z + 2025-07-02T12:06:08.383Z