diff --git a/.nojekyll b/.nojekyll
index 8bdc3fafe..83b2137f9 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-384ec60d
\ No newline at end of file
+de81b179
\ No newline at end of file
diff --git a/docs/config-reference.html b/docs/config-reference.html
index 6923be3d1..951b38c17 100644
--- a/docs/config-reference.html
+++ b/docs/config-reference.html
@@ -1795,654 +1795,659 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
# Specify a custom attention implementation, used mostly for kernels.
attn_implementation: str | None
-# Which experts implementation to use for MoE models,
-experts_implementation: str | None
-
-# Quantize MoE expert weights on load to reduce VRAM. Requires adapter (lora/qlora) with
-# load_in_4bit or load_in_8bit. Requires CUDA (not compatible with ROCm or other
-# backends). Note: total parameter count may be reported incorrectly when enabled
-# (trainable param count is correct).
-quantize_moe_experts: bool = False
-
-# Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399
-scaling_softmax: bool | None
-# Scaling factor for SSMax attention. Default is 0.43
-scaling_softmax_factor: float | None
-# Bias for SSMax attention. Default is 0.0. Note: The paper recommends bias=0 for better
-# length generalization.
-scaling_softmax_bias: float | None
-
-unsloth_cross_entropy_loss: bool | None
-unsloth_lora_mlp: bool | None
-unsloth_lora_qkv: bool | None
-unsloth_lora_o: bool | None
-unsloth_rms_norm: bool | None
-unsloth_rope: bool | None
-
-# Apply custom LoRA autograd functions and activation function Triton kernels for speed
-# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
-lora_mlp_kernel: bool | None
-# Apply custom LoRA autograd functions and activation function Triton kernels for speed
-# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
-lora_qkv_kernel: bool | None
-# Apply custom LoRA autograd functions and activation function Triton kernels for speed
-# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
-lora_o_kernel: bool | None
-# Apply custom LoRA autograd function for embedding layers. See:
-# https://docs.axolotl.ai/docs/lora_optims.html
-lora_embedding_kernel: bool | None
-
-# Whether to use chunked cross entropy loss for memory efficiency
-chunked_cross_entropy: bool | None
-# Number of chunks to use for chunked cross entropy loss
-chunked_cross_entropy_num_chunks: int | None
-# Enable Entropy-Aware Focal Training loss (EAFT)
-use_eaft: bool | None
-# Exponent for entropy weighting in EAFT (default: 1.0)
-eaft_alpha: float | None = 1.0
-# Number of top logits for entropy approximation (default: 20)
-eaft_k: int | None = 20
-
-# Whether to use ALST tiled mlp for memory efficient long context
-tiled_mlp: bool | None
-
-# Number of shards to use for ALST tiled mlp. If unset, it will be set based on
-# seqlen/hidden_size
-tiled_mlp_num_shards: int | None
-
-# Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on
-# llama.
-tiled_mlp_use_original_mlp: bool | None = True
-
-llama4_linearized_experts: bool | None
-
-# Deepspeed config path. e.g., deepspeed_configs/zero3.json
-deepspeed: str | dict[str, Any] | None
-# Whether to use deepcompile for faster training with deepspeed
-deepcompile: bool | None
-# FSDP configuration
-fsdp: list[str] | None
-
-# FSDP configuration options
-fsdp_config: FSDPConfig | None
- # For FSDPConfig:
- # FSDP version
-fsdp_version: int | None
- # Enable activation checkpointing to reduce memory usage during forward passes
-activation_checkpointing: bool | None
- # Offload parameters to CPU to reduce GPU memory usage
-offload_params: bool | None
- # Synchronize module states across all processes
-sync_module_states: bool | None
- # Enable CPU RAM efficient loading to reduce memory usage during model loading
-cpu_ram_efficient_loading: bool | None
- # Disabling this enables swap memory usage for resource-constrained setups when
- # offload_params is enabled.
-cpu_offload_pin_memory: bool | None
- # Use original parameters instead of flattened parameters
-use_orig_params: bool | None
-
- # Type of state dict to use for saving/loading checkpoints
-state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
- # Final state dict type to use after training completion
-final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
+# Use hybrid attention for Gemma 4: flash_attention_2 for sliding window layers and sdpa
+# for global (full_attention) layers. Global layers have head_dim=512 which exceeds
+# flash attention's supported size.
+gemma4_hybrid_attn_impl: bool | None
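+# Example (illustrative): opting into the hybrid implementation in a YAML config:
+#   gemma4_hybrid_attn_impl: true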
+
+# Which experts implementation to use for MoE models.
+experts_implementation: str | None
+
+# Quantize MoE expert weights on load to reduce VRAM. Requires adapter (lora/qlora) with
+# load_in_4bit or load_in_8bit. Requires CUDA (not compatible with ROCm or other
+# backends). Note: total parameter count may be reported incorrectly when enabled
+# (trainable param count is correct).
+quantize_moe_experts: bool = False
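+# Example (illustrative sketch; values are placeholders): quantizing MoE expert weights
+# during a QLoRA run, which satisfies the adapter + load_in_4bit requirement above:
+#   adapter: qlora
+#   load_in_4bit: true
+#   quantize_moe_experts: true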
+
+# Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399
+scaling_softmax: bool | None
+# Scaling factor for SSMax attention. Default is 0.43
+scaling_softmax_factor: float | None
+# Bias for SSMax attention. Default is 0.0. Note: The paper recommends bias=0 for better
+# length generalization.
+scaling_softmax_bias: float | None
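+# Example (illustrative; the values shown are the documented defaults):
+#   scaling_softmax: true
+#   scaling_softmax_factor: 0.43
+#   scaling_softmax_bias: 0.0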
+
+unsloth_cross_entropy_loss: bool | None
+unsloth_lora_mlp: bool | None
+unsloth_lora_qkv: bool | None
+unsloth_lora_o: bool | None
+unsloth_rms_norm: bool | None
+unsloth_rope: bool | None
+
+# Apply custom LoRA autograd functions and activation function Triton kernels for speed
+# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
+lora_mlp_kernel: bool | None
+# Apply custom LoRA autograd functions and activation function Triton kernels for speed
+# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
+lora_qkv_kernel: bool | None
+# Apply custom LoRA autograd functions and activation function Triton kernels for speed
+# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
+lora_o_kernel: bool | None
+# Apply custom LoRA autograd function for embedding layers. See:
+# https://docs.axolotl.ai/docs/lora_optims.html
+lora_embedding_kernel: bool | None
+
+# Whether to use chunked cross entropy loss for memory efficiency
+chunked_cross_entropy: bool | None
+# Number of chunks to use for chunked cross entropy loss
+chunked_cross_entropy_num_chunks: int | None
+# Enable Entropy-Aware Focal Training loss (EAFT)
+use_eaft: bool | None
+# Exponent for entropy weighting in EAFT (default: 1.0)
+eaft_alpha: float | None = 1.0
+# Number of top logits for entropy approximation (default: 20)
+eaft_k: int | None = 20
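+# Example (illustrative; the chunk count is a placeholder): enabling chunked cross
+# entropy together with EAFT using the documented defaults:
+#   chunked_cross_entropy: true
+#   chunked_cross_entropy_num_chunks: 4
+#   use_eaft: true
+#   eaft_alpha: 1.0
+#   eaft_k: 20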
+
+# Whether to use ALST tiled mlp for memory efficient long context
+tiled_mlp: bool | None
+
+# Number of shards to use for ALST tiled mlp. If unset, it will be set based on
+# seqlen/hidden_size
+tiled_mlp_num_shards: int | None
+
+# Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on
+# llama.
+tiled_mlp_use_original_mlp: bool | None = True
+
+llama4_linearized_experts: bool | None
+
+# Deepspeed config path. e.g., deepspeed_configs/zero3.json
+deepspeed: str | dict[str, Any] | None
+# Whether to use deepcompile for faster training with deepspeed
+deepcompile: bool | None
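+# Example (illustrative): using the example path from the comment above and enabling
+# deepcompile alongside it:
+#   deepspeed: deepspeed_configs/zero3.json
+#   deepcompile: true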
+# FSDP configuration
+fsdp: list[str] | None
+
+# FSDP configuration options
+fsdp_config: FSDPConfig | None
+ # For FSDPConfig:
+ # FSDP version
+fsdp_version: int | None
+ # Enable activation checkpointing to reduce memory usage during forward passes
+activation_checkpointing: bool | None
+ # Offload parameters to CPU to reduce GPU memory usage
+offload_params: bool | None
+ # Synchronize module states across all processes
+sync_module_states: bool | None
+ # Enable CPU RAM efficient loading to reduce memory usage during model loading
+cpu_ram_efficient_loading: bool | None
+ # Disabling this enables swap memory usage for resource-constrained setups when
+ # offload_params is enabled.
+cpu_offload_pin_memory: bool | None
+ # Use original parameters instead of flattened parameters
+use_orig_params: bool | None
- # Policy for automatically wrapping modules with FSDP
-auto_wrap_policy: Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None
- # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')
-transformer_layer_cls_to_wrap: str | None
+ # Type of state dict to use for saving/loading checkpoints
+state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
+ # Final state dict type to use after training completion
+final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
- # Reshard parameters after forward pass to save memory
-reshard_after_forward: bool | None
- # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')
-mixed_precision_policy: str | None
+ # Policy for automatically wrapping modules with FSDP
+auto_wrap_policy: Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None
+ # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')
+transformer_layer_cls_to_wrap: str | None
-# FSDP version
-fsdp_version: int | None
-fsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
-
-# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for
-# no eval.
-val_set_size: float | None = 0.0
-
-# Number of devices to shard across. If not set, will use all available devices.
-dp_shard_size: int | None
-# Number of devices to replicate across.
-dp_replicate_size: int | None
-# Deprecated: use `context_parallel_size` instead
-sequence_parallel_degree: int | None
-# Set to a divisor of the number of GPUs available to split sequences into chunks of
-# equal size. Use in long context training to prevent OOM when sequences cannot fit into
-# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each
-# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized
-# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more
-# details.
-context_parallel_size: int | None
-# Optional; strides across the key dimension. Larger values use more memory but should
-# make training faster. Must evenly divide the number of KV heads in your model.
-heads_k_stride: int | None
-# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to
-# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing
-# case.
-ring_attn_func: RingAttnFunc | None
-# Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.
-tensor_parallel_size: int | None
-
-# Add or change special tokens. If you add tokens here, you don't need to add them to
-# the `tokens` list.
-special_tokens: SpecialTokensConfig | None
- # For SpecialTokensConfig:
-bos_token: str | None
-eos_token: str | None
-pad_token: str | None
-unk_token: str | None
-additional_special_tokens: list[str] | None
-
-# Add extra tokens to the tokenizer
-tokens: list[str] | None
-# Mapping token_id to new_token_string to override reserved added_tokens in the
-# tokenizer. Only works for tokens that are not part of the base vocab (aka are
-# added_tokens). Can be checked if they exist in tokenizer.json added_tokens.
-added_tokens_overrides: dict[int, str] | None
-
-# Whether to use torch.compile and which backend to use. setting to `auto` will enable
-# torch compile when torch>=2.6.0
-torch_compile: Literal['auto'] | bool | None
-# Backend to use for torch.compile
-torch_compile_backend: str | None
-torch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None
-
-# Maximum number of iterations to train for. It precedes num_epochs which means that if
-# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =>
-# `num_epochs: 2` and `max_steps: 100` will train for 100 steps
-max_steps: int | None
-# Number of warmup steps. Cannot use with warmup_ratio
-warmup_steps: int | None
-# Warmup ratio. Cannot use with warmup_steps
-warmup_ratio: float | None
-# Leave empty to eval at each epoch, integer for every N steps. float for fraction of
-# total steps
-eval_steps: int | float | None
-# Number of times per epoch to run evals, mutually exclusive with eval_steps
-evals_per_epoch: int | None
-# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer
-# from `eval_steps`
-eval_strategy: str | None
-
-# Leave empty to save at each epoch, integer for every N steps. float for fraction of
-# total steps
-save_steps: int | float | None
-# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps
-saves_per_epoch: int | None
-# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better
-# result is achieved, leave empty to infer from `save_steps`
-save_strategy: str | None
-# Checkpoints saved at a time
-save_total_limit: int | None
-# Whether to checkpoint a model after the first step of training. Defaults to False.
-save_first_step: bool | None
-
-# Logging frequency
-logging_steps: int | None
-# Stop training after this many evaluation losses have increased in a row. https://huggi
-# ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin
-# gCallback
-early_stopping_patience: int | None
-load_best_model_at_end: bool | None = False
-# Save only the model weights, skipping the optimizer. Using this means you can't resume
-# from checkpoints.
-save_only_model: bool | None = False
-# Use tensorboard for logging
-use_tensorboard: bool | None
-# Enable the pytorch profiler to capture the first N steps of training to the
-# output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more
-# information. Snapshots can be visualized @ https://pytorch.org/memory_viz
-profiler_steps: int | None
-# Which step to start the profiler at. Useful for only capturing a few steps mid-run.
-profiler_steps_start: int | None = 0
-# bool of whether to report tokens per second at the end of training. This is not
-# supported with pre-training datasets.
-include_tokens_per_second: bool | None
-# bool of whether to report tokens per second per-gpu during training by measuring
-# throughput of non-padding tokens.
-include_tkps: bool | None = True
-# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to
-# add noise to embeddings. Currently only supported on Llama and Mistral
-neftune_noise_alpha: float | None
-
-# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to
-# `beta` in `ORPOConfig` due to trl mapping.
-orpo_alpha: float | None
-# Target reward margin for the SimPO loss
-simpo_gamma: float | None
-# Weight of the BC regularizer
-cpo_alpha: float | None
-
-# Factor for desirable loss term in KTO loss
-kto_desirable_weight: float | None
-# Factor for undesirable loss term in KTO loss
-kto_undesirable_weight: float | None
-# The beta parameter for the RL training
-rl_beta: float | None
-
-# Defines the max memory usage per gpu on the system. Passed through to transformers
-# when loading the model.
-max_memory: dict[int | Literal['cpu', 'disk'], int | str] | None
-# Limit the memory for all available GPUs to this amount (if an integer, expressed in
-# gigabytes); default: unset
-gpu_memory_limit: int | str | None
-# Whether to use low_cpu_mem_usage
-low_cpu_mem_usage: bool | None
-
-# The name of the chat template to use for training, following values are supported:
-# tokenizer_default: Uses the chat template that is available in the
-# tokenizer_config.json. If the chat template is not available in the tokenizer, it will
-# raise an error. This is the default value.
-# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates
-# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.
-# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.
-# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not
-# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.
-# The custom jinja template should be provided in the chat_template_jinja field. The
-# selected chat template will be saved to the tokenizer_config.json for easier
-# inferencing
-chat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None
-# Custom jinja template or path to jinja file for chat template. This will be only used
-# if chat_template is set to `jinja` or `null` (in which case chat_template is
-# automatically set to `jinja`). Default is null.
-chat_template_jinja: str | None
-# Additional kwargs to pass to the chat template. This is useful for customizing the
-# chat template. For example, you can pass `thinking=False` to add a generation prompt
-# to the chat template.
-chat_template_kwargs: dict[str, Any] | None
-# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the
-# boundaries between conversation turns. For example: ['/INST', '</s>',
-# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is
-# useful for templates that use multiple delimiter tokens.
-eot_tokens: list[str] | None
-# Changes the default system message. Currently only supports chatml.
-default_system_message: str | None
-
-# Token index or indices to adjust embedding weights to the mean of the other tokens.
-# This is useful when the model has untrained embeddings.
-fix_untrained_tokens: int | list[int] | None
-
-is_preprocess: bool | None
-preprocess_iterable: bool | None
-
-# Total number of tokens - internal use
-total_num_tokens: int | None
-total_supervised_tokens: int | None
-# You can set these packing optimizations AFTER starting a training at least once. The
-# trainer will provide recommended values for these values.
-sample_packing_eff_est: float | None
-axolotl_config_path: str | None
-
-# Internal use only - Used to identify which the model is based on
-is_falcon_derived_model: bool | None
-# Internal use only - Used to identify which the model is based on
-is_llama_derived_model: bool | None
-# Internal use only - Used to identify which the model is based on. Please note that if
-# you set this to true, `padding_side` will be set to 'left' by default
-is_mistral_derived_model: bool | None
+ # Reshard parameters after forward pass to save memory
+reshard_after_forward: bool | None
+ # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')
+mixed_precision_policy: str | None
+
+# FSDP version
+fsdp_version: int | None
+fsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
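+# Example (illustrative sketch; one plausible arrangement of the FSDP options above,
+# with placeholder values):
+#   fsdp_version: 2
+#   fsdp_config:
+#     offload_params: false
+#     cpu_ram_efficient_loading: true
+#     auto_wrap_policy: TRANSFORMER_BASED_WRAP
+#     transformer_layer_cls_to_wrap: LlamaDecoderLayer
+#     state_dict_type: FULL_STATE_DICT
+#     reshard_after_forward: true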
+
+# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for
+# no eval.
+val_set_size: float | None = 0.0
+
+# Number of devices to shard across. If not set, will use all available devices.
+dp_shard_size: int | None
+# Number of devices to replicate across.
+dp_replicate_size: int | None
+# Deprecated: use `context_parallel_size` instead
+sequence_parallel_degree: int | None
+# Set to a divisor of the number of GPUs available to split sequences into chunks of
+# equal size. Use in long context training to prevent OOM when sequences cannot fit into
+# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each
+# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized
+# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more
+# details.
+context_parallel_size: int | None
+# Optional; strides across the key dimension. Larger values use more memory but should
+# make training faster. Must evenly divide the number of KV heads in your model.
+heads_k_stride: int | None
+# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to
+# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing
+# case.
+ring_attn_func: RingAttnFunc | None
+# Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.
+tensor_parallel_size: int | None
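+# Example (illustrative; sizes are placeholders): on 4 GPUs, split each long sequence
+# across 2 GPUs and shard the model across the remaining data-parallel dimension:
+#   context_parallel_size: 2
+#   dp_shard_size: 2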
+
+# Add or change special tokens. If you add tokens here, you don't need to add them to
+# the `tokens` list.
+special_tokens: SpecialTokensConfig | None
+ # For SpecialTokensConfig:
+bos_token: str | None
+eos_token: str | None
+pad_token: str | None
+unk_token: str | None
+additional_special_tokens: list[str] | None
+
+# Add extra tokens to the tokenizer
+tokens: list[str] | None
+# Mapping of token_id to new_token_string to override reserved added_tokens in the
+# tokenizer. Only works for tokens that are not part of the base vocab (i.e. are
+# added_tokens). You can check whether they exist in tokenizer.json under added_tokens.
+added_tokens_overrides: dict[int, str] | None
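+# Example (illustrative; the token strings are placeholders):
+#   special_tokens:
+#     eos_token: <|im_end|>
+#     pad_token: <|pad|>
+#   tokens:
+#     - <|tool_call|>
+#     - <|tool_response|>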
+
+# Whether to use torch.compile and which backend to use. Setting to `auto` will enable
+# torch.compile when torch>=2.6.0
+torch_compile: Literal['auto'] | bool | None
+# Backend to use for torch.compile
+torch_compile_backend: str | None
+torch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None
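+# Example (illustrative; backend and mode values are placeholders):
+#   torch_compile: auto
+#   torch_compile_backend: inductor
+#   torch_compile_mode: default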
+
+# Maximum number of iterations to train for. It takes precedence over num_epochs: if
+# both are set, num_epochs is not guaranteed. e.g., when 1 epoch is 1000 steps =>
+# `num_epochs: 2` and `max_steps: 100` will train for 100 steps
+max_steps: int | None
+# Number of warmup steps. Cannot use with warmup_ratio
+warmup_steps: int | None
+# Warmup ratio. Cannot use with warmup_steps
+warmup_ratio: float | None
+# Leave empty to eval at each epoch, integer for every N steps. float for fraction of
+# total steps
+eval_steps: int | float | None
+# Number of times per epoch to run evals, mutually exclusive with eval_steps
+evals_per_epoch: int | None
+# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer
+# from `eval_steps`
+eval_strategy: str | None
+
+# Leave empty to save at each epoch, integer for every N steps. float for fraction of
+# total steps
+save_steps: int | float | None
+# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps
+saves_per_epoch: int | None
+# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better
+# result is achieved, leave empty to infer from `save_steps`
+save_strategy: str | None
+# Maximum number of checkpoints to keep at a time
+save_total_limit: int | None
+# Whether to checkpoint a model after the first step of training. Defaults to False.
+save_first_step: bool | None
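+# Example (illustrative; values are placeholders): evaluate and save every quarter of
+# the total steps and keep only the three most recent checkpoints:
+#   eval_steps: 0.25
+#   save_steps: 0.25
+#   save_total_limit: 3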
+
+# Logging frequency
+logging_steps: int | None
+# Stop training after this many evaluation losses have increased in a row.
+# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+early_stopping_patience: int | None
+load_best_model_at_end: bool | None = False
+# Save only the model weights, skipping the optimizer. Using this means you can't resume
+# from checkpoints.
+save_only_model: bool | None = False
+# Use tensorboard for logging
+use_tensorboard: bool | None
+# Enable the pytorch profiler to capture the first N steps of training to the
+# output_dir. See https://pytorch.org/blog/understanding-gpu-memory-1/ for more
+# information. Snapshots can be visualized at https://pytorch.org/memory_viz
+profiler_steps: int | None
+# Which step to start the profiler at. Useful for only capturing a few steps mid-run.
+profiler_steps_start: int | None = 0
+# Whether to report tokens per second at the end of training. This is not
+# supported with pre-training datasets.
+include_tokens_per_second: bool | None
+# Whether to report tokens per second per-gpu during training by measuring
+# throughput of non-padding tokens.
+include_tkps: bool | None = True
+# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to
+# add noise to embeddings. Currently only supported on Llama and Mistral
+neftune_noise_alpha: float | None
+
+# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to
+# `beta` in `ORPOConfig` due to trl mapping.
+orpo_alpha: float | None
+# Target reward margin for the SimPO loss
+simpo_gamma: float | None
+# Weight of the BC regularizer
+cpo_alpha: float | None
+
+# Factor for desirable loss term in KTO loss
+kto_desirable_weight: float | None
+# Factor for undesirable loss term in KTO loss
+kto_undesirable_weight: float | None
+# The beta parameter for the RL training
+rl_beta: float | None
+
+# Defines the max memory usage per gpu on the system. Passed through to transformers
+# when loading the model.
+max_memory: dict[int | Literal['cpu', 'disk'], int | str] | None
+# Limit the memory for all available GPUs to this amount (if an integer, expressed in
+# gigabytes); default: unset
+gpu_memory_limit: int | str | None
+# Whether to use low_cpu_mem_usage
+low_cpu_mem_usage: bool | None
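+# Example (illustrative; the limits are placeholders): capping GPU 0 and CPU memory
+# during model loading:
+#   max_memory:
+#     0: 20GiB
+#     cpu: 60GiB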
+
+# The name of the chat template to use for training. The following values are supported:
+# tokenizer_default: Uses the chat template that is available in the
+# tokenizer_config.json. If the chat template is not available in the tokenizer, it will
+# raise an error. This is the default value.
+# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates
+# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.
+# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.
+# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not
+# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.
+# The custom jinja template should be provided in the chat_template_jinja field. The
+# selected chat template will be saved to the tokenizer_config.json for easier
+# inferencing
+chat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None
+# Custom jinja template or path to jinja file for chat template. This will only be used
+# if chat_template is set to `jinja` or `null` (in which case chat_template is
+# automatically set to `jinja`). Default is null.
+chat_template_jinja: str | None
+# Additional kwargs to pass to the chat template. This is useful for customizing the
+# chat template. For example, you can pass `thinking=False` to add a generation prompt
+# to the chat template.
+chat_template_kwargs: dict[str, Any] | None
+# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the
+# boundaries between conversation turns. For example: ['/INST', '</s>',
+# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is
+# useful for templates that use multiple delimiter tokens.
+eot_tokens: list[str] | None
+# Changes the default system message. Currently only supports chatml.
+default_system_message: str | None
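+# Example (illustrative; the template name and token are placeholders): selecting a
+# built-in chat template and marking its end-of-turn token:
+#   chat_template: chatml
+#   eot_tokens:
+#     - <|im_end|>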
+
+# Token index or indices to adjust embedding weights to the mean of the other tokens.
+# This is useful when the model has untrained embeddings.
+fix_untrained_tokens: int | list[int] | None
+
+is_preprocess: bool | None
+preprocess_iterable: bool | None
+
+# Total number of tokens - internal use
+total_num_tokens: int | None
+total_supervised_tokens: int | None
+# You can set these packing optimizations AFTER starting a training run at least once.
+# The trainer will provide recommended values for these settings.
+sample_packing_eff_est: float | None
+axolotl_config_path: str | None
+
+# Internal use only - Used to identify which the model is based on
+is_falcon_derived_model: bool | None
+# Internal use only - Used to identify which the model is based on
-is_qwen_derived_model: bool | None
-
-# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available
-# plugins or doc below for more details.
-# https://docs.axolotl.ai/docs/custom_integrations.html
-plugins: list[str] | None
-# Enable sample generation during training for monitoring
-generate_samples: bool | None = False
-# Number of samples to generate at each interval
-num_generation_samples: int | None = 3
-# Maximum new tokens to generate per sample
-generation_max_new_tokens: int | None = 50
-# Temperature for sample generation (0.0 = greedy)
-generation_temperature: float | None = 0.7
-# Nucleus sampling parameter for generation
-generation_top_p: float | None
-# Top-k sampling parameter for generation
-generation_top_k: int | None
-# Ratio of input to use as prompt (0.0-1.0)
-generation_prompt_ratio: float | None = 0.5
-# Whether to use sampling (vs greedy decoding)
-generation_do_sample: bool | None = True
-
-# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This
-# can also be a relative path to a model on disk
-base_model: str (required)
-# If the base_model repo on hf hub doesn't include configuration .json files, You can
-# set that here, or leave this empty to default to base_model
-base_model_config: str | None
-# transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to
-# AutoConfig.
-cls_model_config: str | None
-# Optional tokenizer configuration path in case you want to use a different tokenizer
-# than the one defined in the base model
-tokenizer_config: str | None
-# use_fast option for tokenizer loading from_pretrained, default to True
-tokenizer_use_fast: bool | None
-# Whether to use the legacy tokenizer setting, defaults to True
-tokenizer_legacy: bool | None
-# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-
-# common tokenizer.
-tokenizer_use_mistral_common: bool | None
-# Corresponding tokenizer for the model AutoTokenizer is a good choice
-tokenizer_type: str | None
-# transformers processor class
-processor_type: str | None
-# Whether to save jinja files for tokenizer, transformers default is True
-tokenizer_save_jinja_files: bool | None = True
-# Trust remote code for untrusted source
-trust_remote_code: bool | None
-
-# Don't move the model to the device before sharding. Set to `false` to revert to legacy
-# behavior.
-experimental_skip_move_to_device: bool | None = True
-
-# Use custom kernels, e.g. MegaBlocks.
-use_kernels: bool | None
-
-# Model loading quantization config
-model_quantization_config: Literal['Mxfp4Config'] | None
-# kwargs for model quantization config
-model_quantization_config_kwargs: dict[str, Any] | None
+is_llama_derived_model: bool | None
+# Internal use only - Used to identify which the model is based on. Please note that if
+# you set this to true, `padding_side` will be set to 'left' by default
+is_mistral_derived_model: bool | None
+# Internal use only - Used to identify which the model is based on
+is_qwen_derived_model: bool | None
+
+# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available
+# plugins or doc below for more details.
+# https://docs.axolotl.ai/docs/custom_integrations.html
+plugins: list[str] | None
+# Enable sample generation during training for monitoring
+generate_samples: bool | None = False
+# Number of samples to generate at each interval
+num_generation_samples: int | None = 3
+# Maximum new tokens to generate per sample
+generation_max_new_tokens: int | None = 50
+# Temperature for sample generation (0.0 = greedy)
+generation_temperature: float | None = 0.7
+# Nucleus sampling parameter for generation
+generation_top_p: float | None
+# Top-k sampling parameter for generation
+generation_top_k: int | None
+# Ratio of input to use as prompt (0.0-1.0)
+generation_prompt_ratio: float | None = 0.5
+# Whether to use sampling (vs greedy decoding)
+generation_do_sample: bool | None = True
+
+# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This
+# can also be a relative path to a model on disk
+base_model: str (required)
+# If the base_model repo on hf hub doesn't include configuration .json files, you can
+# set that here, or leave this empty to default to base_model
+base_model_config: str | None
+# transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to
+# AutoConfig.
+cls_model_config: str | None
+# Optional tokenizer configuration path in case you want to use a different tokenizer
+# than the one defined in the base model
+tokenizer_config: str | None
+# use_fast option for tokenizer loading from_pretrained, default to True
+tokenizer_use_fast: bool | None
+# Whether to use the legacy tokenizer setting, defaults to True
+tokenizer_legacy: bool | None
+# Whether to use the mistral-common tokenizer.
+tokenizer_use_mistral_common: bool | None
+# Corresponding tokenizer for the model. AutoTokenizer is a good choice
+tokenizer_type: str | None
+# transformers processor class
+processor_type: str | None
+# Whether to save jinja files for tokenizer, transformers default is True
+tokenizer_save_jinja_files: bool | None = True
+# Trust remote code for untrusted source
+trust_remote_code: bool | None
+
+# Don't move the model to the device before sharding. Set to `false` to revert to legacy
+# behavior.
+experimental_skip_move_to_device: bool | None = True
+
+# Use custom kernels, e.g. MegaBlocks.
+use_kernels: bool | None
-# Where to save the full-finetuned model to
-output_dir: str = ./model-out
-# push checkpoints to hub
-hub_model_id: str | None
-# how to push checkpoints to hub
-hub_strategy: str | None
-# branch/revision to push to on hub (default: main)
-hub_revision: str | None
-# Whether to save the model using safetensors format. Defaults to True.
-save_safetensors: bool | None = True
-
-# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
-load_in_8bit: bool | None = False
-# Use bitsandbytes 4 bit
-load_in_4bit: bool | None = False
+# Model loading quantization config
+model_quantization_config: Literal['Mxfp4Config'] | None
+# kwargs for model quantization config
+model_quantization_config_kwargs: dict[str, Any] | None
+
+# Where to save the full-finetuned model to
+output_dir: str = ./model-out
+# push checkpoints to hub
+hub_model_id: str | None
+# how to push checkpoints to hub
+hub_strategy: str | None
+# branch/revision to push to on hub (default: main)
+hub_revision: str | None
+# Whether to save the model using safetensors format. Defaults to True.
+save_safetensors: bool | None = True
-# If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all
-# parameters in original model
-adapter: Literal['lora', 'qlora', 'llama-adapter'] | None
-# If you already have a lora model trained that you want to load, put that here. This
-# means after training, if you want to test the model, you should set this to the value
-# of `output_dir`. Note that if you merge an adapter to the base model, a new
-# subdirectory `merged` will be created under the `output_dir`.
-lora_model_dir: str | None
-lora_r: int | None
-lora_alpha: int | None
-lora_fan_in_fan_out: bool | None
-lora_target_modules: str | list[str] | None
-lora_target_parameters: str | list[str] | None
-# If true, will target all linear modules
-lora_target_linear: bool | None
-# If you added new tokens to the tokenizer, you may need to save some LoRA modules
-# because they need to know the new tokens. For LLaMA and Mistral, you need to save
-# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts
-# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
-lora_modules_to_save: list[str] | None
-lora_dropout: float | None = 0.0
-# The layer indices to transform, otherwise, apply to all layers
-peft_layers_to_transform: list[int] | None
-peft_layers_pattern: list[str] | None
-
-peft: PeftConfig | None
- # For PeftConfig:
- # Configuration options for loftq initialization for LoRA
-loftq_config: LoftQConfig | None
- # For LoftQConfig:
- # typically 4 bits
-loftq_bits: int = 4
-
-# Whether to use DoRA.
-peft_use_dora: bool | None
-# Whether to use RSLoRA.
-peft_use_rslora: bool | None
-# List of layer indices to replicate.
-peft_layer_replication: list[tuple[int, int]] | None
-# How to initialize LoRA weights. Default to True which is MS original implementation.
-peft_init_lora_weights: bool | str | None
-# A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict
-# mapping an embedding layer name to its trainable token indices. See
-# https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-
-# tokens-alongside-lora
-peft_trainable_token_indices: list[int] | dict[str, list[int]] | None
-# Whether to tie adapter weights for tied model weights. See
-# https://github.com/huggingface/peft/issues/2864
-peft_ensure_weight_tying: bool | None
-# Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT.
-peft_autocast_adapter_dtype: bool | None
-
-# load qlora model in sharded format for FSDP using answer.ai technique.
-qlora_sharded_model_loading: bool | None = False
-# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it
-# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
-lora_on_cpu: bool | None
-# Whether you are training a 4-bit GPTQ quantized model
-gptq: bool | None
-# optional overrides to the bnb 4bit quantization configuration
-bnb_config_kwargs: dict[str, Any] | None
-
-# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.
-loraplus_lr_ratio: float | None
-# loraplus learning rate for lora embedding layers. Default value is 1e-6.
-loraplus_lr_embedding: float | None = 1e-06
+# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
+load_in_8bit: bool | None = False
+# Use bitsandbytes 4 bit
+load_in_4bit: bool | None = False
+
+# Set to 'lora', 'qlora', or 'llama-adapter' to use an adapter, or leave blank to train
+# all parameters of the original model
+adapter: Literal['lora', 'qlora', 'llama-adapter'] | None
+# If you already have a lora model trained that you want to load, put that here. This
+# means after training, if you want to test the model, you should set this to the value
+# of `output_dir`. Note that if you merge an adapter to the base model, a new
+# subdirectory `merged` will be created under the `output_dir`.
+lora_model_dir: str | None
+lora_r: int | None
+lora_alpha: int | None
+lora_fan_in_fan_out: bool | None
+lora_target_modules: str | list[str] | None
+lora_target_parameters: str | list[str] | None
+# If true, will target all linear modules
+lora_target_linear: bool | None
+# If you added new tokens to the tokenizer, you may need to save some LoRA modules
+# because they need to know the new tokens. For LLaMA and Mistral, you need to save
+# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts
+# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
+lora_modules_to_save: list[str] | None
+lora_dropout: float | None = 0.0
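+# Example (illustrative sketch; rank, alpha, and dropout are placeholders): a LoRA
+# adapter over all linear layers that also saves the embedding layers for newly added
+# tokens, per the note above:
+#   adapter: lora
+#   lora_r: 16
+#   lora_alpha: 32
+#   lora_dropout: 0.05
+#   lora_target_linear: true
+#   lora_modules_to_save:
+#     - embed_tokens
+#     - lm_head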
+# The layer indices to transform, otherwise, apply to all layers
+peft_layers_to_transform: list[int] | None
+peft_layers_pattern: list[str] | None
+
+peft: PeftConfig | None
+ # For PeftConfig:
+ # Configuration options for loftq initialization for LoRA
+loftq_config: LoftQConfig | None
+ # For LoftQConfig:
+ # typically 4 bits
+loftq_bits: int = 4
+
+# Whether to use DoRA.
+peft_use_dora: bool | None
+# Whether to use RSLoRA.
+peft_use_rslora: bool | None
+# List of layer indices to replicate.
+peft_layer_replication: list[tuple[int, int]] | None
+# How to initialize LoRA weights. Defaults to True, which is the MS original implementation.
+peft_init_lora_weights: bool | str | None
+# A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict
+# mapping an embedding layer name to its trainable token indices. See
+# https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-
+# tokens-alongside-lora
+peft_trainable_token_indices: list[int] | dict[str, list[int]] | None
+# Whether to tie adapter weights for tied model weights. See
+# https://github.com/huggingface/peft/issues/2864
+peft_ensure_weight_tying: bool | None
+# Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT.
+peft_autocast_adapter_dtype: bool | None
+
+# load qlora model in sharded format for FSDP using answer.ai technique.
+qlora_sharded_model_loading: bool | None = False
+# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it
+# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
+lora_on_cpu: bool | None
+# Whether you are training a 4-bit GPTQ quantized model
+gptq: bool | None
+# optional overrides to the bnb 4bit quantization configuration
+bnb_config_kwargs: dict[str, Any] | None
-merge_lora: bool | None
-# Method to use for LoRA merging. 'memory_efficient' (default) processes shards
-# individually to reduce memory usage, 'legacy' loads the full model into memory.
-merge_method: Literal['legacy', 'memory_efficient'] | None = memory_efficient
+# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.
+loraplus_lr_ratio: float | None
+# loraplus learning rate for lora embedding layers. Default value is 1e-6.
+loraplus_lr_embedding: float | None = 1e-06
-# Whether to use ReLoRA. Use with jagged_restart_*steps options.
-relora: bool | None
-# threshold for optimizer magnitude when pruning
-relora_prune_ratio: float | None
-# True to perform lora weight merges on cpu during restarts, for modest gpu memory
-# savings
-relora_cpu_offload: bool | None
-
-# how often to reset for jagged restarts
-jagged_restart_steps: int | None
-# how many warmup steps to take after reset for jagged restarts
-jagged_restart_warmup_steps: int | None
-# how many anneal steps to take before reset for jagged restarts
-jagged_restart_anneal_steps: int | None
-
-# If greater than 1, backpropagation will be skipped and the gradients will be
-# accumulated for the given number of steps.
-gradient_accumulation_steps: int | None = 1
-# The number of samples to include in each batch. This is the number of samples sent to
-# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps
-micro_batch_size: int | None = 1
-# Total batch size, we do not recommended setting this manually
-batch_size: int | None
-# per gpu micro batch size for evals, defaults to value of micro_batch_size
-eval_batch_size: int | None
-
-# whether to find batch size that fits in memory. Passed to underlying transformers
-# Trainer
-auto_find_batch_size: bool | None
-
-# Whether to mask out or include the human's prompt from the training labels
-train_on_inputs: bool | None = False
-# Group similarly sized data to minimize padding. May be slower to start, as it must
-# download and sort the entire dataset. Note that training loss may have an oscillating
-# pattern with this enabled.
-group_by_length: bool | None
-
-learning_rate: str | float (required)
-embedding_lr: float | None
-embedding_lr_scale: float | None
-# Specify weight decay
-weight_decay: float | None = 0.0
-# Specify optimizer
-optimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED
-# Dictionary of arguments to pass to the optimizer
-optim_args: str | dict[str, Any] | None
-# The target modules to optimize, i.e. the module names that you would like to train,
-# right now this is used only for GaLore algorithm
-optim_target_modules: list[str] | Literal['all_linear'] | None
-# Path to torch distx for optim 'adamw_anyprecision'
-torchdistx_path: str | None
-lr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE
-# Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler_kwargs: dict[str, Any] | None
-lr_quadratic_warmup: bool | None
-# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of
-# peak lr
-cosine_min_lr_ratio: float | None
-# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means
-# start cosine_min_lr at 80% of training step
-cosine_constant_lr_ratio: float | None
-# Learning rate div factor
-lr_div_factor: float | None
-
-lr_groups: list[LrGroup] | None
- # For LrGroup:
-name: str (required)
-modules: list[str] (required)
-lr: float (required)
-
-# adamw hyperparams
-adam_epsilon: float | None
-# only used for CAME Optimizer
-adam_epsilon2: float | None
-# adamw hyperparams
-adam_beta1: float | None
-# adamw hyperparams
-adam_beta2: float | None
-# only used for CAME Optimizer
-adam_beta3: float | None
-
-# Dion Optimizer learning rate
-dion_lr: float | None
-# Dion Optimizer momentum
-dion_momentum: float | None
-# Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank
-# dimension.
-dion_rank_fraction: float | None = 1.0
-# Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may
-# be useful to ensure even sharding.
-dion_rank_multiple_of: int | None = 1
-
-# Gradient clipping max norm
-max_grad_norm: float | None
-num_epochs: float = 1.0
-
-use_wandb: bool | None
-# Set the name of your wandb run
-wandb_name: str | None
-# Set the ID of your wandb run
-wandb_run_id: str | None
-# "offline" to save run metadata locally and not sync to the server, "disabled" to turn
-# off wandb
-wandb_mode: str | None
-# Your wandb project name
-wandb_project: str | None
-# A wandb Team name if using a Team
-wandb_entity: str | None
-wandb_watch: str | None
-# "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only
-# at the end of training
-wandb_log_model: str | None
-
-use_mlflow: bool | None
-# URI to mlflow
-mlflow_tracking_uri: str | None
-# Your experiment name
-mlflow_experiment_name: str | None
-# Your run name
-mlflow_run_name: str | None
-# set to true to copy each saved checkpoint on each save to mlflow artifact registry
-hf_mlflow_log_artifacts: bool | None
-
-# Enable or disable Comet integration.
-use_comet: bool | None
-# API key for Comet. Recommended to set via `comet login`.
-comet_api_key: str | None
-# Workspace name in Comet. Defaults to the user's default workspace.
-comet_workspace: str | None
-# Project name in Comet. Defaults to Uncategorized.
-comet_project_name: str | None
-# Identifier for the experiment. Used to append data to an existing experiment or
-# control the key of new experiments. Default to a random key.
-comet_experiment_key: str | None
-# Create a new experiment ("create") or log to an existing one ("get"). Default
-# ("get_or_create") auto-selects based on configuration.
-comet_mode: str | None
-# Set to True to log data to Comet server, or False for offline storage. Default is
-# True.
-comet_online: bool | None
-# Dictionary for additional configuration settings, see the doc for more details.
-comet_experiment_config: dict[str, Any] | None
-
-use_trackio: bool | None
-# Your trackio project name
-trackio_project_name: str | None
-# Set the name of your trackio run
-trackio_run_name: str | None
-# Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)
-trackio_space_id: str | None
-
-# Enable OpenTelemetry metrics collection and Prometheus export
-use_otel_metrics: bool | None = False
-# Host to bind the OpenTelemetry metrics server to
-otel_metrics_host: str | None = localhost
-# Port for the Prometheus metrics HTTP server
-otel_metrics_port: int | None = 8000
-
-# the number of activate layers in LISA
-lisa_n_layers: int | None
-# how often to switch layers in LISA
-lisa_step_interval: int | None
-# path under the model to access the layers
-lisa_layers_attribute: str | None = model.layers
-
-gradio_title: str | None
-gradio_share: bool | None
-gradio_server_name: str | None
-gradio_server_port: int | None
-gradio_max_new_tokens: int | None
-gradio_temperature: float | None
-
-use_ray: bool = False
-ray_run_name: str | None
-ray_num_workers: int = 1
-resources_per_worker: dict
+merge_lora: bool | None
+# Method to use for LoRA merging. 'memory_efficient' (default) processes shards
+# individually to reduce memory usage, 'legacy' loads the full model into memory.
+merge_method: Literal['legacy', 'memory_efficient'] | None = memory_efficient
+
+# Whether to use ReLoRA. Use with jagged_restart_*steps options.
+relora: bool | None
+# threshold for optimizer magnitude when pruning
+relora_prune_ratio: float | None
+# True to perform lora weight merges on cpu during restarts, for modest gpu memory
+# savings
+relora_cpu_offload: bool | None
+
+# how often to reset for jagged restarts
+jagged_restart_steps: int | None
+# how many warmup steps to take after reset for jagged restarts
+jagged_restart_warmup_steps: int | None
+# how many anneal steps to take before reset for jagged restarts
+jagged_restart_anneal_steps: int | None
+
+# If greater than 1, the optimizer step will be skipped and gradients will be
+# accumulated for the given number of steps.
+gradient_accumulation_steps: int | None = 1
+# The number of samples to include in each batch. This is the number of samples sent to
+# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps
+micro_batch_size: int | None = 1
+# Total batch size; we do not recommend setting this manually
+batch_size: int | None
+# per gpu micro batch size for evals, defaults to value of micro_batch_size
+eval_batch_size: int | None
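+# Example (illustrative): with the settings below on 8 GPUs, each GPU processes 2
+# samples per forward pass and accumulates 4 passes, i.e. 2 * 4 = 8 samples per GPU and
+# 8 * 8 = 64 samples per optimizer step:
+#   micro_batch_size: 2
+#   gradient_accumulation_steps: 4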
+
+# whether to find batch size that fits in memory. Passed to underlying transformers
+# Trainer
+auto_find_batch_size: bool | None
+
+# Whether to mask out or include the human's prompt from the training labels
+train_on_inputs: bool | None = False
+# Group similarly sized data to minimize padding. May be slower to start, as it must
+# download and sort the entire dataset. Note that training loss may have an oscillating
+# pattern with this enabled.
+group_by_length: bool | None
+
+learning_rate: str | float (required)
+embedding_lr: float | None
+embedding_lr_scale: float | None
+# Specify weight decay
+weight_decay: float | None = 0.0
+# Specify optimizer
+optimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED
+# Dictionary of arguments to pass to the optimizer
+optim_args: str | dict[str, Any] | None
+# The target modules to optimize, i.e. the module names that you would like to train.
+# Currently this is only used for the GaLore algorithm
+optim_target_modules: list[str] | Literal['all_linear'] | None
+# Path to torch distx for optim 'adamw_anyprecision'
+torchdistx_path: str | None
+lr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE
+# Specify a scheduler and kwargs to use with the optimizer
+lr_scheduler_kwargs: dict[str, Any] | None
+lr_quadratic_warmup: bool | None
+# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of
+# peak lr
+cosine_min_lr_ratio: float | None
+# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means
+# start cosine_min_lr at 80% of training step
+cosine_constant_lr_ratio: float | None
+# Learning rate div factor
+lr_div_factor: float | None
+
+lr_groups: list[LrGroup] | None
+ # For LrGroup:
+name: str (required)
+modules: list[str] (required)
+lr: float (required)
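+# Example (illustrative; the group name, modules, and rate are placeholders): giving
+# attention projections their own learning rate:
+#   lr_groups:
+#     - name: attn
+#       modules:
+#         - q_proj
+#         - k_proj
+#         - v_proj
+#       lr: 1.0e-5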
+
+# adamw hyperparams
+adam_epsilon: float | None
+# only used for CAME Optimizer
+adam_epsilon2: float | None
+# adamw hyperparams
+adam_beta1: float | None
+# adamw hyperparams
+adam_beta2: float | None
+# only used for CAME Optimizer
+adam_beta3: float | None
+
+# Dion Optimizer learning rate
+dion_lr: float | None
+# Dion Optimizer momentum
+dion_momentum: float | None
+# Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank
+# dimension.
+dion_rank_fraction: float | None = 1.0
+# Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may
+# be useful to ensure even sharding.
+dion_rank_multiple_of: int | None = 1
+
+# Gradient clipping max norm
+max_grad_norm: float | None
+num_epochs: float = 1.0
+
+use_wandb: bool | None
+# Set the name of your wandb run
+wandb_name: str | None
+# Set the ID of your wandb run
+wandb_run_id: str | None
+# "offline" to save run metadata locally and not sync to the server, "disabled" to turn
+# off wandb
+wandb_mode: str | None
+# Your wandb project name
+wandb_project: str | None
+# A wandb Team name if using a Team
+wandb_entity: str | None
+wandb_watch: str | None
+# "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only
+# at the end of training
+wandb_log_model: str | None
+
+use_mlflow: bool | None
+# URI to mlflow
+mlflow_tracking_uri: str | None
+# Your experiment name
+mlflow_experiment_name: str | None
+# Your run name
+mlflow_run_name: str | None
+# set to true to copy each saved checkpoint on each save to mlflow artifact registry
+hf_mlflow_log_artifacts: bool | None
+
+# Enable or disable Comet integration.
+use_comet: bool | None
+# API key for Comet. Recommended to set via `comet login`.
+comet_api_key: str | None
+# Workspace name in Comet. Defaults to the user's default workspace.
+comet_workspace: str | None
+# Project name in Comet. Defaults to Uncategorized.
+comet_project_name: str | None
+# Identifier for the experiment. Used to append data to an existing experiment or
+# control the key of new experiments. Defaults to a random key.
+comet_experiment_key: str | None
+# Create a new experiment ("create") or log to an existing one ("get"). Default
+# ("get_or_create") auto-selects based on configuration.
+comet_mode: str | None
+# Set to True to log data to Comet server, or False for offline storage. Default is
+# True.
+comet_online: bool | None
+# Dictionary for additional configuration settings, see the doc for more details.
+comet_experiment_config: dict[str, Any] | None
+
+use_trackio: bool | None
+# Your trackio project name
+trackio_project_name: str | None
+# Set the name of your trackio run
+trackio_run_name: str | None
+# Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)
+trackio_space_id: str | None
+
+# Enable OpenTelemetry metrics collection and Prometheus export
+use_otel_metrics: bool | None = False
+# Host to bind the OpenTelemetry metrics server to
+otel_metrics_host: str | None = localhost
+# Port for the Prometheus metrics HTTP server
+otel_metrics_port: int | None = 8000
+
+# the number of active layers in LISA
+lisa_n_layers: int | None
+# how often to switch layers in LISA
+lisa_step_interval: int | None
+# path under the model to access the layers
+lisa_layers_attribute: str | None = model.layers
+
+gradio_title: str | None
+gradio_share: bool | None
+gradio_server_name: str | None
+gradio_server_port: int | None
+gradio_max_new_tokens: int | None
+gradio_temperature: float | None
-# The size of the image to resize to. It can be an integer (resized into padded-square
-# image) or a tuple (width, height).If not provided, we will attempt to load from
-# preprocessor.size, otherwise, images won't be resized.
-image_size: int | tuple[int, int] | None
-# The resampling algorithm to use for image resizing. Default is bilinear. Please refer
-# to PIL.Image.Resampling for more details.
-image_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None
-
-# optional overrides to the base model configuration
-overrides_of_model_config: dict[str, Any] | None
-# optional overrides the base model loading from_pretrained
-overrides_of_model_kwargs: dict[str, Any] | None
-# If you want to specify the type of model to load, AutoModelForCausalLM is a good
-# choice too
-type_of_model: str | None
-# You can specify to choose a specific model revision from huggingface hub
-revision_of_model: str | None
-
-max_packed_sequence_len: int | None
-rope_scaling: Any | None
-noisy_embedding_alpha: float | None
-dpo_beta: float | None
-evaluation_strategy: str | None
-eval_table_size: int | None
-eval_max_new_tokens: int | None
-dpo_use_logits_to_keep: bool | None
-dpo_generate_during_eval: bool | None
-dpo_norm_loss: bool | None
-rpo_alpha: float | None
+use_ray: bool = False
+ray_run_name: str | None
+ray_num_workers: int = 1
+resources_per_worker: dict
+
+# The size of the image to resize to. It can be an integer (resized into a padded-square
+# image) or a tuple (width, height). If not provided, we will attempt to load from
+# preprocessor.size; otherwise, images won't be resized.
+image_size: int | tuple[int, int] | None
+# The resampling algorithm to use for image resizing. Default is bilinear. Please refer
+# to PIL.Image.Resampling for more details.
+image_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None
+
+# optional overrides to the base model configuration
+overrides_of_model_config: dict[str, Any] | None
+# optional overrides to the kwargs passed when loading the base model via from_pretrained
+overrides_of_model_kwargs: dict[str, Any] | None
+# If you want to specify the type of model to load, AutoModelForCausalLM is a good
+# choice too
+type_of_model: str | None
+# You can specify a specific model revision to load from the Hugging Face Hub
+revision_of_model: str | None
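+# Example (illustrative values): pin a Hub revision and override a config field at load
+#   type_of_model: AutoModelForCausalLM
+#   revision_of_model: main
+#   overrides_of_model_config:
+#     rope_theta: 1000000.0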
+
+max_packed_sequence_len: int | None
+rope_scaling: Any | None
+noisy_embedding_alpha: float | None
+dpo_beta: float | None
+evaluation_strategy: str | None
+eval_table_size: int | None
+eval_max_new_tokens: int | None
+dpo_use_logits_to_keep: bool | None
+dpo_generate_during_eval: bool | None
+dpo_norm_loss: bool | None
+rpo_alpha: float | None
diff --git a/search.json b/search.json
index f588ca744..5d4854380 100644
--- a/search.json
+++ b/search.json
@@ -6030,7 +6030,7 @@
"href": "docs/config-reference.html",
"title": "Config Reference",
"section": "",
- "text": "# Allow overwrite yml config using from cli\nstrict: bool | None = False\n# Resume from a specific checkpoint dir\nresume_from_checkpoint: str | None\n# If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: bool | None\n# Resize the model embeddings when new tokens are added to multiples of 32. This is\n# reported to improve training speed on some models\nresize_token_embeddings_to_32x: bool | None\nmean_resizing_embeddings: bool | None = False\n\n# Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.\nshrink_embeddings: bool | None\n# Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs\nembeddings_skip_upcast: bool | None\n# Reinitialize model weights randomly instead of loading pretrained weights\nreinit_weights: bool | None\n\n# module to custom trainer class to use for training\ntrainer_cls: str | None\n\n# Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo', 'ebft'\nrl: RLType | None\n\ntrl: TRLConfig | None\n # For TRLConfig:\n # Beta parameter for the RL training. Same as `rl_beta`. Use\n beta: float | None\n # Maximum length of the completion for RL training.\n max_completion_length: int | None\n\n # Whether to use VLLM for RL training.\n use_vllm: bool = False\n # VLLM mode to use, one of 'server' or 'colocate'\n vllm_mode: Literal['server', 'colocate'] | None\n # Host of the vLLM server to connect to.\n vllm_server_host: str | None = 0.0.0.0\n # Port of the vLLM server to connect to.\n vllm_server_port: int | None = 8000\n # Total timeout (in seconds) to wait for the vLLM server to respond.\n vllm_server_timeout: int | None\n # Regex for vLLM guided decoding.\n vllm_guided_decoding_regex: str | None\n\n # List of reward functions to load. Paths must be importable from current dir.\n reward_funcs: list[str] | None\n # List of reward weights for the reward functions.\n reward_weights: list[float] | None\n # Batch size for generation. Controls how many unique prompts are generated per step.\n # Should be num_generations * data_parallel_size for full DP utilization.\n generation_batch_size: int | None\n # Number of generations to sample.\n num_generations: int | None\n # Whether to log completions.\n log_completions: bool | None = False\n # Number of completions to print when log_completions is True.\n num_completions_to_print: int | None\n # Controls whether importance sampling ratios are computed at the `'token'` or\n # `'sequence'` level. For GSPO, use `sequence`, default is None which corresponds to\n # the original GRPO paper.\n importance_sampling_level: Literal['sequence', 'token'] | None\n\n # Whether to sync the reference model.\n sync_ref_model: bool | None = False\n # Mixup alpha for the reference model.\n ref_model_mixup_alpha: float | None = 0.9\n # Sync steps for the reference model.\n ref_model_sync_steps: int | None = 64\n # Whether to scale rewards by their standard deviation.\n scale_rewards: bool = True\n\n # Sampling temperature for the GRPO policy.\n temperature: float | None\n # Top-p sampling probability for the generation policy.\n top_p: float | None\n # Top-k sampling for the generation policy.\n top_k: int | None\n # Minimum probability for the generation policy.\n min_p: float | None\n # Penalty for tokens that appear in prompt and generated text.\n repetition_penalty: float | None\n # Additional generation parameters passed to vLLM SamplingParams. 
Useful for\n # stop_token_ids, seed, frequency_penalty, etc.\n generation_kwargs: dict[str, Any] | None\n # Additional kwargs for the chat template. E.g., {enable_thinking: false} for Qwen3.5\n # models.\n chat_template_kwargs: dict[str, Any] | None\n # Number of iterations per batch (μ) for GRPO.\n num_iterations: int | None\n # Epsilon value for clipping in the GRPO algorithm.\n epsilon: float | None\n # Upper-bound epsilon value for clipping in the GRPO algorithm.\n epsilon_high: float | None\n # Whether to use Liger loss for GRPO.\n use_liger_loss: bool | None\n # Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.\n loss_type: str | None\n # Whether to exclude truncated completions from loss calculation.\n mask_truncated_completions: bool = False\n # Enable sleep mode for vLLM to offload VRAM when idle\n vllm_enable_sleep_mode: bool | None\n # Path to custom rollout function. Must be importable from current dir.\n rollout_func: str | None\n # Multi-objective reward aggregation strategy. 'sum_then_normalize' (GRPO default):\n # weights and sums rewards first, then normalizes. 'normalize_then_sum' (GDPO):\n # normalizes each reward independently, then sums.\n multi_objective_aggregation: Literal['sum_then_normalize', 'normalize_then_sum'] | None\n\n # Use the GRPODataProducer protocol for online data generation.\n use_data_producer: bool = False\n # Generate rollouts in a background thread while training on the previous rollout.\n async_prefetch: bool = False\n # Number of rollouts to prefetch ahead of training.\n prefetch_depth: int | None\n # Sync model weights to vLLM every N optimizer steps (async mode only).\n vllm_sync_interval: int | None\n # Score prompt groups incrementally instead of the full batch at once.\n streaming_partial_batch: bool | None\n # Minimum prompt groups to score per streaming chunk.\n streaming_min_groups: int | None\n # Apply IS correction for distribution mismatch between vLLM and training model.\n vllm_importance_sampling_correction: bool | None\n # IS mode: token_truncate, token_mask, sequence_truncate, or sequence_mask.\n vllm_importance_sampling_mode: Literal['token_truncate', 'token_mask', 'sequence_truncate', 'sequence_mask'] | None\n # Cap C for IS ratio clipping/masking.\n vllm_importance_sampling_cap: float | None\n # KL threshold for off-policy sequence masking (OPSM). None = disabled.\n off_policy_mask_threshold: float | None\n # Apply IS correction to KL divergence term.\n use_bias_correction_kl: bool | None\n\n # Number of persistent subprocess workers for parallel reward computation. Each worker\n # has its own main thread so signal.alarm() (used by math_verify) works correctly.\n # Work is sharded across workers by prompt groups. Only used with\n # use_data_producer=True and non-nn.Module reward functions.\n reward_num_workers: int = 1\n # [Experimental, disabled by default] Size of the replay buffer for storing high-\n # signal rollout groups. When > 0, groups with reward variance are cached and used to\n # replace zero-signal groups (where all rewards are identical). Set to 0 to disable.\n # Only used with use_data_producer=True.\n replay_buffer_size: int = 0\n # When True (default), recompute old_per_token_logps for replayed groups using the\n # current training model. This fixes the importance sampling mismatch that occurs when\n # replaying stale data. Only relevant when replay_buffer_size > 0.\n replay_recompute_logps: bool = True\n # Fraction of total training steps after which deferred re-rolling begins. 
Zero-signal\n # prompts (where all rewards in a group are identical) are buffered and re-injected\n # into later batches when the model is more likely to solve them. Set to 1.0 to\n # disable. Only used with use_data_producer=True.\n reroll_start_fraction: float = 1.0\n # Maximum number of prompt groups to replace with re-roll candidates per batch. Higher\n # values increase data utilization but reduce prompt diversity. Only used with\n # use_data_producer=True.\n reroll_max_groups: int = 1\n # When True, skip gradient computation for micro-batches where all advantages are zero\n # (no learning signal). This avoids the forward/backward pass entirely when no\n # learning signal is present. The step is logged with skipped_zero_adv_batches=1 for\n # monitoring.\n skip_zero_advantage_batches: bool = True\n # Sync LoRA adapter to vLLM via filesystem instead of merging + NCCL broadcast. Auto-\n # selects vllm_serve_lora serve module. Syncs only LoRA adapter weights vs full merged\n # model.\n vllm_lora_sync: bool = False\n\nvllm: VllmConfig | None\n # For VllmConfig:\n # Device to use for VLLM\n device: str | None = auto\n # Tensor parallel size for VLLM\n tensor_parallel_size: int | None\n # Data parallel size for VLLM\n data_parallel_size: int | None\n # GPU memory utilization for VLLM\n gpu_memory_utilization: float | None = 0.9\n # Data type for VLLM\n dtype: str | None = auto\n # Maximum length of the model context for VLLM\n max_model_len: int | None\n # Enable prefix caching for VLLM\n enable_prefix_caching: bool | None\n # Host for the vLLM server to start on\n host: str | None = 0.0.0.0\n # Port of the vLLM server to start on\n port: int | None = 8000\n\n # Enable reasoning for VLLM\n enable_reasoning: bool | None\n # Reasoning parser for VLLM\n reasoning_parser: str | None\n # Disable CUDA graph capture in vLLM. Required for models with causal_conv1d (e.g.,\n # Qwen3.5 hybrid linear attention).\n enforce_eager: bool | None\n # Python module for vLLM serve script. Set to 'axolotl.scripts.vllm_serve_lora' for\n # native LoRA support, or leave None for default TRL serve.\n serve_module: str | None\n # vLLM worker extension class for weight synchronization. 
Defaults to\n # 'trl.scripts.vllm_serve.WeightSyncWorkerExtension'.\n worker_extension_cls: str | None\n\n# Configuration for Energy-Based Fine-Tuning (EBFT)\nebft: EBFTConfig | None\n # For EBFTConfig:\n # Fractional layer depths for feature extraction (e.g., [0.25, 0.5, 0.75])\n feature_layers: list[float] = [0.25, 0.5, 0.75]\n # Embedding method: 'last_token', 'mean_pooling', 'completion_mean', or 'concat'\n embed_method: Literal['last_token', 'mean_pooling', 'completion_mean', 'concat'] = last_token\n # Apply SVD whitening to feature embeddings\n use_whitening: bool = False\n # Coefficient for alignment reward (cosine similarity with ground truth)\n alignment_coef: float = 1.0\n # Coefficient for diversity penalty (pairwise similarity between samples)\n diversity_coef: float = 1.0\n # Cross-entropy loss coefficient on ground-truth tokens\n ce_coef: float = 0.0\n # Set per-batch max_tokens based on ground-truth length\n adaptive_max_tokens: bool = True\n # Multiplier for ground-truth token count when computing adaptive max_tokens\n gt_length_multiplier: float = 1.5\n\n # EBFT mode: 'structured' (QA with vLLM) or 'strided' (unstructured text)\n mode: Literal['structured', 'strided'] = structured\n # Stride between anchor points (tokens)\n stride: int = 8\n # Context window size per block\n context_length: int = 8\n # Tokens to generate per block\n generate_max_len: int = 8\n # Independent rollouts per document\n n_samples_per_prompt: int = 4\n # Sampling temperature for strided generation\n temperature: float = 0.6\n # Top-p nucleus sampling threshold\n top_p: float = 1.0\n # RL policy gradient loss coefficient\n rl_coef: float = 1.0\n # Advantage estimator: 'rloo', 'group_norm', 'reinforce'\n advantage_estimator: Literal['rloo', 'group_norm', 'reinforce'] = rloo\n # Minimum tokens into completion before placing anchors. Skips anchors too close to\n # the prompt boundary where features are dominated by prompt context.\n min_completion_prefix: int = 0\n\nqat: QATConfig | None\n # For QATConfig:\n # Fake quantization layout to use for activation quantization.\n activation_dtype: TorchAOQuantDType | None\n # Fake quantization layout to use for weight quantization.\n weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8\n # Quantize embedding\n quantize_embedding: bool | None = False\n # The number of elements in each group for per-group fake quantization\n group_size: int | None = 32\n # The number of steps to apply fake quantization after\n fake_quant_after_n_steps: int | None\n\nquantization: PTQConfig | None\n # For PTQConfig:\n # Fake quantization layout to use for weight quantization.\n weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8\n # Fake quantization layout to use for activation quantization.\n activation_dtype: TorchAOQuantDType | None\n # Whether to quantize the embedding layer.\n quantize_embedding: bool | None\n # The number of elements in each group for per-group fake quantization\n group_size: int | None = 32\n\n# Reward modelling: `True` or `False`\nreward_model: bool | None\n\n# Configuration for dynamic checkpointing (trigger by file or signal). Set 'enabled:\n# true' to activate this feature.\ndynamic_checkpoint: DynamicCheckpointConfig | None\n # For DynamicCheckpointConfig:\n # Enable dynamic checkpoint triggering during training. Create a file\n # 'axolotl_checkpoint.save' in the configured `output_dir` to trigger.\n enabled: bool = False\n # Check for trigger file every N steps (reduces I/O overhead). 
Default: 100\n check_interval: int = 10\n # Custom trigger filename (optional). If not specified, defaults to\n # 'axolotl_checkpoint.save'. Specify a filename (not a full path) to override the\n # default.\n trigger_file_path: str = \n\n# Process reward modelling: `True` or `False`\nprocess_reward_model: bool | None\n# Coefficient to incentivize the reward model to output mean-zero rewards (proposed by\n# https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`.\ncenter_rewards_coefficient: float | None\nnum_labels: int | None\n\n# Whether to perform weighting in DPO trainer\ndpo_use_weighting: bool | None\ndpo_label_smoothing: float | None\n# Precompute reference model log probabilities for DPO\nprecompute_ref_log_probs: bool | None\n\n# Whether to use Liger kernel for DPO loss.\ndpo_use_liger_kernel: bool | None\n\ndpo_padding_free: bool | None\n\n# A list of one or more datasets to finetune the model with\ndatasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None\n # For SFTDataset:\n # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n path: str | None\n # name of dataset split to load from\n split: str | None\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: str | UserDefinedPrompterType | None\n # For UserDefinedPrompterType:\n # Custom user instruction prompt\n system_prompt: str | None\n # Use {system} as key to be replaced\n system_format: str | None\n field_system: str | None\n field_instruction: str | None\n field_input: str | None\n field_output: str | None\n\n # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n # be replaced. 'format' can include {input}\n format: str | None\n # 'no_input_format' cannot include {input}\n no_input_format: str | None\n input_transform: str | None\n # split dataset into N pieces (use with shards_idx)\n shards: int | None\n # the index of sharded dataset to use\n shards_idx: int | None\n # process dataset in N sequential chunks for memory efficiency (exclusive with\n # `shards`)\n preprocess_shards: int | None\n conversation: str | None\n\n # The name of the chat template to use for training, following values are supported:\n # tokenizer_default: Uses the chat template that is available in the\n # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n # will raise an error. This is the default.\n # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n # template. The custom jinja template should be provided in the chat_template_jinja\n # field.\n chat_template: ChatTemplate | str | None\n # Custom jinja chat template or path to jinja file. 
Used only if `chat_template:\n # jinja` or empty.\n chat_template_jinja: str | None\n # path to source data files\n data_files: str | list[str] | None\n input_format: str | None\n # name of dataset configuration to load\n name: str | None\n # defines the datatype when path is a file\n ds_type: str | None\n # For `completion` datasets only, uses the provided field instead of `text` column\n field: str | None\n field_human: str | None\n field_model: str | None\n # Key containing the messages (default: \"messages\")\n field_messages: str | None\n # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n # schema](https://json-schema.org/learn/getting-started-step-by-step).\n field_tools: str | None\n # Key containing the reasoning trace (default: \"reasoning_content\").\n field_thinking: str | None\n # The key the chat template expects that indicates the reasoning trace.\n template_thinking_key: str | None\n\n message_field_role: str | None\n\n message_field_content: str | None\n # Mapping of properties from the input dataset to the chat template. (default:\n # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n # in the template but not in this mapping, the system will attempt to load it directly\n # from the message using the property name as the key. Example: In the mapping below,\n # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n # used as 'content' in the chat template.\n message_property_mappings: dict[str, str] | None\n # The key in the message turn that indicates via boolean whether tokens of a turn\n # should be considered for training. Useful to selectively train on certain turns\n # besides the `roles_to_train`.\n message_field_training: str | None\n # The key in the message turn that contains the training details. Useful to\n # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n # containing `begin_offset` (start character index in content), `end_offset` (end\n # character index in content), and `train` (boolean whether to train).\n message_field_training_detail: str | None\n # (for Qwen3 template only) Whether to split the assistant content based on a\n # reasoning trace inside delimited tags\n split_thinking: bool | None\n logprobs_field: str | None\n temperature: float | None\n # Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: list[str] | None\n # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n # turn, last: train on the last EOS token in the conversation\n train_on_eos: Literal['all', 'turn', 'last'] | None\n # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n # source roles will be mapped to the target role. The default is: user: [\"human\",\n # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n roles: dict[str, list[str]] | None\n # Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. 
If\n # you wish to, we recommend using a custom jinja template with the default system\n # message removed or adding a system turn with empty content.\n drop_system_message: bool | None\n # Trust remote code for untrusted source\n trust_remote_code: bool | None = False\n # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n # This can be a commit hash, tag, or branch name. If not specified, the latest version\n # will be used. This parameter is ignored for local datasets.\n revision: str | None\n\n # For DPODataset:\n path: str | None\n split: str | None\n type: UserDefinedDPOType | str | None\n # For UserDefinedDPOType:\n field_system: str | None\n field_prompt: str | None\n field_chosen: str | None\n field_rejected: str | None\n prompt_format: str | None\n chosen_format: str | None\n rejected_format: str | None\n data_files: list[str] | None\n revision: str | None\n field_messages: str | None\n\n # For KTODataset:\n path: str | None\n split: str | None\n type: UserDefinedKTOType | str | None\n # For UserDefinedKTOType:\n field_system: str | None\n field_prompt: str | None\n field_completion: str | None\n field_label: bool | None\n prompt_format: str | None\n completion_format: str | None\n data_files: list[str] | None\n trust_remote_code: bool | None = False\n revision: str | None\n\n # For StepwiseSupervisedDataset:\n path: str | None\n split: str | None\n data_files: list[str] | None\n revision: str | None\n step_separator: str | None\n max_completion_length: int | None\n train_on_last_step_only: bool | None\n\n # For SyntheticDataset:\n path: Literal['synthetic'] = synthetic\n type: Literal['_synthetic'] = _synthetic\n # Number of rows to generate\n length: int = 1000\n # Sequence length per row (defaults to sequence_len from config)\n sequence_length: int | None\n # Minimum token ID for generation\n min_input_id: int = 100\n # Maximum token ID for generation (defaults to tokenizer vocab_size)\n max_input_id: int | None\n # Random seed for reproducibility\n seed: int | None\n\n# A list of one or more datasets to eval the model with. You can use either\n# test_datasets, or val_set_size, but not both.\ntest_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None\n # For SFTDataset:\n # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n path: str | None\n # name of dataset split to load from\n split: str | None\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: str | UserDefinedPrompterType | None\n # For UserDefinedPrompterType:\n # Custom user instruction prompt\n system_prompt: str | None\n # Use {system} as key to be replaced\n system_format: str | None\n field_system: str | None\n field_instruction: str | None\n field_input: str | None\n field_output: str | None\n\n # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n # be replaced. 
'format' can include {input}\n format: str | None\n # 'no_input_format' cannot include {input}\n no_input_format: str | None\n input_transform: str | None\n # split dataset into N pieces (use with shards_idx)\n shards: int | None\n # the index of sharded dataset to use\n shards_idx: int | None\n # process dataset in N sequential chunks for memory efficiency (exclusive with\n # `shards`)\n preprocess_shards: int | None\n conversation: str | None\n\n # The name of the chat template to use for training, following values are supported:\n # tokenizer_default: Uses the chat template that is available in the\n # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n # will raise an error. This is the default.\n # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n # template. The custom jinja template should be provided in the chat_template_jinja\n # field.\n chat_template: ChatTemplate | str | None\n # Custom jinja chat template or path to jinja file. Used only if `chat_template:\n # jinja` or empty.\n chat_template_jinja: str | None\n # path to source data files\n data_files: str | list[str] | None\n input_format: str | None\n # name of dataset configuration to load\n name: str | None\n # defines the datatype when path is a file\n ds_type: str | None\n # For `completion` datasets only, uses the provided field instead of `text` column\n field: str | None\n field_human: str | None\n field_model: str | None\n # Key containing the messages (default: \"messages\")\n field_messages: str | None\n # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n # schema](https://json-schema.org/learn/getting-started-step-by-step).\n field_tools: str | None\n # Key containing the reasoning trace (default: \"reasoning_content\").\n field_thinking: str | None\n # The key the chat template expects that indicates the reasoning trace.\n template_thinking_key: str | None\n\n message_field_role: str | None\n\n message_field_content: str | None\n # Mapping of properties from the input dataset to the chat template. (default:\n # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n # in the template but not in this mapping, the system will attempt to load it directly\n # from the message using the property name as the key. Example: In the mapping below,\n # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n # used as 'content' in the chat template.\n message_property_mappings: dict[str, str] | None\n # The key in the message turn that indicates via boolean whether tokens of a turn\n # should be considered for training. Useful to selectively train on certain turns\n # besides the `roles_to_train`.\n message_field_training: str | None\n # The key in the message turn that contains the training details. Useful to\n # selectively train on certain tokens in a turn. 
The value of the key is a List[Dict]\n # containing `begin_offset` (start character index in content), `end_offset` (end\n # character index in content), and `train` (boolean whether to train).\n message_field_training_detail: str | None\n # (for Qwen3 template only) Whether to split the assistant content based on a\n # reasoning trace inside delimited tags\n split_thinking: bool | None\n logprobs_field: str | None\n temperature: float | None\n # Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: list[str] | None\n # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n # turn, last: train on the last EOS token in the conversation\n train_on_eos: Literal['all', 'turn', 'last'] | None\n # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n # source roles will be mapped to the target role. The default is: user: [\"human\",\n # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n roles: dict[str, list[str]] | None\n # Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. If\n # you wish to, we recommend using a custom jinja template with the default system\n # message removed or adding a system turn with empty content.\n drop_system_message: bool | None\n # Trust remote code for untrusted source\n trust_remote_code: bool | None = False\n # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n # This can be a commit hash, tag, or branch name. If not specified, the latest version\n # will be used. This parameter is ignored for local datasets.\n revision: str | None\n\n # For DPODataset:\n path: str | None\n split: str | None\n type: UserDefinedDPOType | str | None\n # For UserDefinedDPOType:\n field_system: str | None\n field_prompt: str | None\n field_chosen: str | None\n field_rejected: str | None\n prompt_format: str | None\n chosen_format: str | None\n rejected_format: str | None\n data_files: list[str] | None\n revision: str | None\n field_messages: str | None\n\n # For KTODataset:\n path: str | None\n split: str | None\n type: UserDefinedKTOType | str | None\n # For UserDefinedKTOType:\n field_system: str | None\n field_prompt: str | None\n field_completion: str | None\n field_label: bool | None\n prompt_format: str | None\n completion_format: str | None\n data_files: list[str] | None\n trust_remote_code: bool | None = False\n revision: str | None\n\n # For StepwiseSupervisedDataset:\n path: str | None\n split: str | None\n data_files: list[str] | None\n revision: str | None\n step_separator: str | None\n max_completion_length: int | None\n train_on_last_step_only: bool | None\n\n # For SyntheticDataset:\n path: Literal['synthetic'] = synthetic\n type: Literal['_synthetic'] = _synthetic\n # Number of rows to generate\n length: int = 1000\n # Sequence length per row (defaults to sequence_len from config)\n sequence_length: int | None\n # Minimum token ID for generation\n min_input_id: int = 100\n # Maximum token ID for generation (defaults to tokenizer vocab_size)\n max_input_id: int | None\n # Random seed for reproducibility\n seed: int | None\n\n# If false, the datasets will not be shuffled and will keep their original order in\n# `datasets`. 
The same applies to the `test_datasets` option and the\n# `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: bool | None = True\n# If true, each dataset in `datasets` will be shuffled before merging. This allows\n# curriculum learning strategies to be applied at the dataset level. Default is false.\nshuffle_before_merging_datasets: bool | None = False\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: str | None\n# Num shards for whole dataset\ndataset_shard_num: int | None\n# Index of shard to use for whole dataset\ndataset_shard_idx: int | None\nskip_prepare_dataset: bool | None = False\n# Number of shards to save the prepared dataset\nnum_dataset_shards_to_save: int | None\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None\n # For PretrainingDataset:\n name: str | None\n path: str | None\n split: str | None = train\n text_column: str | None = text\n type: str | None = pretrain\n trust_remote_code: bool | None = False\n data_files: str | None\n skip: int | None\n\n # For SFTDataset:\n # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n path: str | None\n # name of dataset split to load from\n split: str | None\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: str | UserDefinedPrompterType | None\n # For UserDefinedPrompterType:\n # Custom user instruction prompt\n system_prompt: str | None\n # Use {system} as key to be replaced\n system_format: str | None\n field_system: str | None\n field_instruction: str | None\n field_input: str | None\n field_output: str | None\n\n # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n # be replaced. 'format' can include {input}\n format: str | None\n # 'no_input_format' cannot include {input}\n no_input_format: str | None\n input_transform: str | None\n # split dataset into N pieces (use with shards_idx)\n shards: int | None\n # the index of sharded dataset to use\n shards_idx: int | None\n # process dataset in N sequential chunks for memory efficiency (exclusive with\n # `shards`)\n preprocess_shards: int | None\n conversation: str | None\n\n # The name of the chat template to use for training, following values are supported:\n # tokenizer_default: Uses the chat template that is available in the\n # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n # will raise an error. This is the default.\n # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n # template. The custom jinja template should be provided in the chat_template_jinja\n # field.\n chat_template: ChatTemplate | str | None\n # Custom jinja chat template or path to jinja file. 
Used only if `chat_template:\n # jinja` or empty.\n chat_template_jinja: str | None\n # path to source data files\n data_files: str | list[str] | None\n input_format: str | None\n # name of dataset configuration to load\n name: str | None\n # defines the datatype when path is a file\n ds_type: str | None\n # For `completion` datasets only, uses the provided field instead of `text` column\n field: str | None\n field_human: str | None\n field_model: str | None\n # Key containing the messages (default: \"messages\")\n field_messages: str | None\n # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n # schema](https://json-schema.org/learn/getting-started-step-by-step).\n field_tools: str | None\n # Key containing the reasoning trace (default: \"reasoning_content\").\n field_thinking: str | None\n # The key the chat template expects that indicates the reasoning trace.\n template_thinking_key: str | None\n\n message_field_role: str | None\n\n message_field_content: str | None\n # Mapping of properties from the input dataset to the chat template. (default:\n # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n # in the template but not in this mapping, the system will attempt to load it directly\n # from the message using the property name as the key. Example: In the mapping below,\n # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n # used as 'content' in the chat template.\n message_property_mappings: dict[str, str] | None\n # The key in the message turn that indicates via boolean whether tokens of a turn\n # should be considered for training. Useful to selectively train on certain turns\n # besides the `roles_to_train`.\n message_field_training: str | None\n # The key in the message turn that contains the training details. Useful to\n # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n # containing `begin_offset` (start character index in content), `end_offset` (end\n # character index in content), and `train` (boolean whether to train).\n message_field_training_detail: str | None\n # (for Qwen3 template only) Whether to split the assistant content based on a\n # reasoning trace inside delimited tags\n split_thinking: bool | None\n logprobs_field: str | None\n temperature: float | None\n # Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: list[str] | None\n # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n # turn, last: train on the last EOS token in the conversation\n train_on_eos: Literal['all', 'turn', 'last'] | None\n # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n # source roles will be mapped to the target role. The default is: user: [\"human\",\n # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n roles: dict[str, list[str]] | None\n # Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. 
If\n # you wish to, we recommend using a custom jinja template with the default system\n # message removed or adding a system turn with empty content.\n drop_system_message: bool | None\n # Trust remote code for untrusted source\n trust_remote_code: bool | None = False\n # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n # This can be a commit hash, tag, or branch name. If not specified, the latest version\n # will be used. This parameter is ignored for local datasets.\n revision: str | None\n\n# The maximum number of processes to use while preprocessing your input dataset. This\n# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of\n# vCPUs via RUNPOD_CPU_COUNT.\ndataset_processes: int | None\n# The maximum number of processes to use while preprocessing your input dataset. This\n# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of\n# vCPUs via RUNPOD_CPU_COUNT.\ndataset_num_proc: int | None\n\n# Deduplicates datasets and test_datasets with identical entries\ndataset_exact_deduplication: bool | None\n# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking\n# too much storage\ndataset_keep_in_memory: bool | None\ndataloader_pin_memory: bool | None\ndataloader_num_workers: int | None\ndataloader_prefetch_factor: int | None\ndataloader_drop_last: bool | None\n\naccelerator_config: dict[str, Any] | None\n\nremove_unused_columns: bool | None\n\n# Push prepared dataset to hub - repo_org/repo_name\npush_dataset_to_hub: str | None\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private\n# datasets. Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: bool | None\n\ndevice: Any | None\n# Passed through to transformers when loading the model when launched without\n# accelerate. Use `sequential` when training w/ model parallelism to limit memory\ndevice_map: Any | None\nworld_size: int | None\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank: int | None\nddp: bool | None\n\n# Seed for reproducibility\nseed: int | None\n# Advanced DDP Arguments - timeout\nddp_timeout: int | None\n# Advanced DDP Arguments - bucket cap in MB\nddp_bucket_cap_mb: int | None\n# Advanced DDP Arguments - broadcast buffers\nddp_broadcast_buffers: bool | None\nddp_find_unused_parameters: bool | None\n\n# Whether to run causal language model evaluation for metrics in\n# `eval_causal_lm_metrics`\ndo_causal_lm_eval: bool | None\n# HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter',\n# 'chrf', 'perplexity']\neval_causal_lm_metrics: list[str] | None\ndo_bench_eval: bool | None\nbench_dataset: str | None\nbench_split: str | None\nmetric_for_best_model: str | None\ngreater_is_better: bool | None\n\n# High loss value, indicating the learning has broken down (a good estimate is ~2 times\n# the loss at the start of training)\nloss_watchdog_threshold: float | None\n# Number of high-loss steps in a row before the trainer aborts (default: 3)\nloss_watchdog_patience: int | None\n\n# Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before\n# evaluations. Default is 0 (disabled).\ngc_steps: int | None\n\n# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.\n# require >=ampere\nbf16: Literal['auto'] | bool | None = auto\n# Use CUDA fp16\nfp16: bool | None\n# Enable FP8 mixed precision training using TorchAO. 
Best used in combination with\n# torch.compile.\nfp8: bool | None\n# Enable FSDP float8 all-gather optimization for FP8 training. Can improve training\n# speed by 10-15% when FSDP is enabled.\nfp8_enable_fsdp_float8_all_gather: bool | None\n# No AMP (automatic mixed precision) - require >=ampere\nbfloat16: bool | None\n# No AMP (automatic mixed precision)\nfloat16: bool | None\n# bool to use CUDA tf32 or 'auto' for automatic detection - require >=ampere\ntf32: Literal['auto'] | bool | None = auto\nfloat32: bool | None\n\n# Whether to use gradient checkpointing. Available options are: true, false, 'offload',\n# 'offload_disk'.\n# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False\n# Additional kwargs to pass to the trainer for gradient checkpointing\ngradient_checkpointing_kwargs: dict[str, Any] | None\n# Whether to offload activations. Available options are: true, false, 'legacy', 'disk'.\nactivation_offloading: Literal['legacy', 'disk'] | bool | None = False\n# Offload model layer parameters to CPU during forward, prefetch back during backward.\nlayer_offloading: bool | None = False\n\n# Freeze multimodal encoder parameters (vision, audio, etc.) for text-only training of\n# multimodal models. When True, parameters belonging to vision towers, audio towers,\n# multimodal projectors, and similar non-language modules are frozen\n# (requires_grad=False). This allows DDP training without\n# ddp_find_unused_parameters=True.\nfreeze_mm_modules: bool | None\n\n# List of regex patterns for parameter names to keep unfrozen. All other parameters will\n# be frozen via requires_grad=False. Note: range-based patterns (e.g.\n# embed_tokens.weight$[:32000]) use gradient zeroing rather than a true freeze, so\n# weight decay will still apply to the frozen portion and optimizer states are allocated\n# for the full parameter.\nunfrozen_parameters: list[str] | None\n\n# The maximum length of an input to train with, this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: int = 512\n# What to do when a tokenized row exceeds sequence_len. 'drop' removes the row;\n# 'truncate' slices tensors to sequence_len; 'raise' raises a ValueError. Defaults to\n# 'drop' for backward compatibility.\nexcess_length_strategy: Literal['drop', 'truncate', 'raise'] | None\n# The maximum length of an input for evaluation. If not specified, defaults to\n# sequence_len\neval_sequence_len: int | None\nmin_sample_len: int | None\n# maximum prompt length for RL training\nmax_prompt_len: int | None\n# Use efficient multi-packing with block diagonal attention and per sequence\n# position_ids. Recommend set to 'true'\nsample_packing: bool | None\n# The number of samples packed at a time. Increasing the following values helps with\n# packing, but usually only slightly (<%1.)\nsample_packing_group_size: int | None = 100000\n# The number of samples which can be packed into one sequence. Increase if using a large\n# sequence_len with many short samples.\nsample_packing_bin_size: int | None = 200\n# Whether to pack samples sequentially\nsample_packing_sequentially: bool | None\n# The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or\n# 'forkserver'\nsample_packing_mp_start_method: str | None\n# Set to 'false' if getting errors during eval with sample_packing on\neval_sample_packing: bool | None\n# Pad inputs so each step uses constant sized buffers. 
This will reduce memory\n# fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to\n# True if `sample_packing` enabled\npad_to_sequence_len: bool | None\n# Whether to use sequential sampling for curriculum learning\ncurriculum_sampling: bool | None\nmultipack_real_batches: bool | None\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening: Literal['auto'] | bool | None\n\nuse_pose: bool | None\npose_split_on_token_ids: list[int] | None\npose_max_context_len: int | None\npose_num_chunks: int | None\n\npretrain_multipack_buffer_size: int | None\n# whether to prevent cross attention for packed sequences during pretraining\npretrain_multipack_attn: bool | None = True\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation: bool | None\n\n# Use streaming mode for loading datasets\nstreaming: bool | None\n# Buffer size for multipack streaming datasets\nstreaming_multipack_buffer_size: int | None = 10000\n\n# Whether to use xformers attention patch https://github.com/facebookresearch/xformers\nxformers_attention: bool | None\n# Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/\n# torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention: bool | None\n# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention: bool | None\nflex_attention: bool | None\nflex_attn_compile_kwargs: dict[str, Any] | None\n# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention\nflash_attention: bool | None\n# Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_cross_entropy: bool | None\n# Whether to use flash-attention rms norm implementation - advanced use only\nflash_attn_rms_norm: bool | None\n# Whether to fuse part of the MLP into a single operation\nflash_attn_fuse_mlp: bool | None\n# Whether to use bettertransformers\nflash_optimum: bool | None\n# Whether to use SageAttention https://github.com/thu-ml/SageAttention\nsage_attention: bool | None\n\neager_attention: bool | None\n\n# Specify a custom attention implementation, used mostly for kernels.\nattn_implementation: str | None\n\n# Which experts implementation to use for MoE models,\nexperts_implementation: str | None\n\n# Quantize MoE expert weights on load to reduce VRAM. Requires adapter (lora/qlora) with\n# load_in_4bit or load_in_8bit. Requires CUDA (not compatible with ROCm or other\n# backends). Note: total parameter count may be reported incorrectly when enabled\n# (trainable param count is correct).\nquantize_moe_experts: bool = False\n\n# Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399\nscaling_softmax: bool | None\n# Scaling factor for SSMax attention. Default is 0.43\nscaling_softmax_factor: float | None\n# Bias for SSMax attention. Default is 0.0. Note: The paper recommends bias=0 for better\n# length generalization.\nscaling_softmax_bias: float | None\n\nunsloth_cross_entropy_loss: bool | None\nunsloth_lora_mlp: bool | None\nunsloth_lora_qkv: bool | None\nunsloth_lora_o: bool | None\nunsloth_rms_norm: bool | None\nunsloth_rope: bool | None\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_mlp_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. 
See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_qkv_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_o_kernel: bool | None\n# Apply custom LoRA autograd function for embedding layers. See:\n# https://docs.axolotl.ai/docs/lora_optims.html\nlora_embedding_kernel: bool | None\n\n# Whether to use chunked cross entropy loss for memory efficiency\nchunked_cross_entropy: bool | None\n# Number of chunks to use for chunked cross entropy loss\nchunked_cross_entropy_num_chunks: int | None\n# Enable Entropy-Aware Focal Training loss (EAFT)\nuse_eaft: bool | None\n# Exponent for entropy weighting in EAFT (default: 1.0)\neaft_alpha: float | None = 1.0\n# Number of top logits for entropy approximation (default: 20)\neaft_k: int | None = 20\n\n# Whether to use ALST tiled mlp for memory efficient long context\ntiled_mlp: bool | None\n\n# Number of shards to use for ALST tiled mlp. If unset, it will be set based on\n# seqlen/hidden_size\ntiled_mlp_num_shards: int | None\n\n# Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on\n# llama.\ntiled_mlp_use_original_mlp: bool | None = True\n\nllama4_linearized_experts: bool | None\n\n# Deepspeed config path. e.g., deepspeed_configs/zero3.json\ndeepspeed: str | dict[str, Any] | None\n# Whether to use deepcompile for faster training with deepspeed\ndeepcompile: bool | None\n# FSDP configuration\nfsdp: list[str] | None\n\n# FSDP configuration options\nfsdp_config: FSDPConfig | None\n # For FSDPConfig:\n # FSDP version\n fsdp_version: int | None\n # Enable activation checkpointing to reduce memory usage during forward passes\n activation_checkpointing: bool | None\n # Offload parameters to CPU to reduce GPU memory usage\n offload_params: bool | None\n # Synchronize module states across all processes\n sync_module_states: bool | None\n # Enable CPU RAM efficient loading to reduce memory usage during model loading\n cpu_ram_efficient_loading: bool | None\n # Disabling this enables swap memory usage for resource-constrained setups when\n # offload_params is enabled.\n cpu_offload_pin_memory: bool | None\n # Use original parameters instead of flattened parameters\n use_orig_params: bool | None\n\n # Type of state dict to use for saving/loading checkpoints\n state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n # Final state dict type to use after training completion\n final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n # Policy for automatically wrapping modules with FSDP\n auto_wrap_policy: Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None\n # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')\n transformer_layer_cls_to_wrap: str | None\n\n # Reshard parameters after forward pass to save memory\n reshard_after_forward: bool | None\n # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')\n mixed_precision_policy: str | None\n\n# FSDP version\nfsdp_version: int | None\nfsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for\n# no eval.\nval_set_size: float | None = 0.0\n\n# Number of devices to shard across. 
If not set, will use all available devices.\ndp_shard_size: int | None\n# Number of devices to replicate across.\ndp_replicate_size: int | None\n# Deprecated: use `context_parallel_size` instead\nsequence_parallel_degree: int | None\n# Set to a divisor of the number of GPUs available to split sequences into chunks of\n# equal size. Use in long context training to prevent OOM when sequences cannot fit into\n# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each\n# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized\n# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more\n# details.\ncontext_parallel_size: int | None\n# Optional; strides across the key dimension. Larger values use more memory but should\n# make training faster. Must evenly divide the number of KV heads in your model.\nheads_k_stride: int | None\n# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to\n# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing\n# case.\nring_attn_func: RingAttnFunc | None\n# Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.\ntensor_parallel_size: int | None\n\n# Add or change special tokens. If you add tokens here, you don't need to add them to\n# the `tokens` list.\nspecial_tokens: SpecialTokensConfig | None\n # For SpecialTokensConfig:\n bos_token: str | None\n eos_token: str | None\n pad_token: str | None\n unk_token: str | None\n additional_special_tokens: list[str] | None\n\n# Add extra tokens to the tokenizer\ntokens: list[str] | None\n# Mapping token_id to new_token_string to override reserved added_tokens in the\n# tokenizer. Only works for tokens that are not part of the base vocab (aka are\n# added_tokens). Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides: dict[int, str] | None\n\n# Whether to use torch.compile and which backend to use. setting to `auto` will enable\n# torch compile when torch>=2.6.0\ntorch_compile: Literal['auto'] | bool | None\n# Backend to use for torch.compile\ntorch_compile_backend: str | None\ntorch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None\n\n# Maximum number of iterations to train for. It precedes num_epochs which means that if\n# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =>\n# `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps: int | None\n# Number of warmup steps. Cannot use with warmup_ratio\nwarmup_steps: int | None\n# Warmup ratio. Cannot use with warmup_steps\nwarmup_ratio: float | None\n# Leave empty to eval at each epoch, integer for every N steps. float for fraction of\n# total steps\neval_steps: int | float | None\n# Number of times per epoch to run evals, mutually exclusive with eval_steps\nevals_per_epoch: int | None\n# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer\n# from `eval_steps`\neval_strategy: str | None\n\n# Leave empty to save at each epoch, integer for every N steps. 
float for fraction of\n# total steps\nsave_steps: int | float | None\n# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsaves_per_epoch: int | None\n# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better\n# result is achieved, leave empty to infer from `save_steps`\nsave_strategy: str | None\n# Checkpoints saved at a time\nsave_total_limit: int | None\n# Whether to checkpoint a model after the first step of training. Defaults to False.\nsave_first_step: bool | None\n\n# Logging frequency\nlogging_steps: int | None\n# Stop training after this many evaluation losses have increased in a row. https://huggi\n# ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin\n# gCallback\nearly_stopping_patience: int | None\nload_best_model_at_end: bool | None = False\n# Save only the model weights, skipping the optimizer. Using this means you can't resume\n# from checkpoints.\nsave_only_model: bool | None = False\n# Use tensorboard for logging\nuse_tensorboard: bool | None\n# Enable the pytorch profiler to capture the first N steps of training to the\n# output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more\n# information. Snapshots can be visualized @ https://pytorch.org/memory_viz\nprofiler_steps: int | None\n# Which step to start the profiler at. Useful for only capturing a few steps mid-run.\nprofiler_steps_start: int | None = 0\n# bool of whether to report tokens per second at the end of training. This is not\n# supported with pre-training datasets.\ninclude_tokens_per_second: bool | None\n# bool of whether to report tokens per second per-gpu during training by measuring\n# throughput of non-padding tokens.\ninclude_tkps: bool | None = True\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to\n# add noise to embeddings. Currently only supported on Llama and Mistral\nneftune_noise_alpha: float | None\n\n# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to\n# `beta` in `ORPOConfig` due to trl mapping.\norpo_alpha: float | None\n# Target reward margin for the SimPO loss\nsimpo_gamma: float | None\n# Weight of the BC regularizer\ncpo_alpha: float | None\n\n# Factor for desirable loss term in KTO loss\nkto_desirable_weight: float | None\n# Factor for undesirable loss term in KTO loss\nkto_undesirable_weight: float | None\n# The beta parameter for the RL training\nrl_beta: float | None\n\n# Defines the max memory usage per gpu on the system. Passed through to transformers\n# when loading the model.\nmax_memory: dict[int | Literal['cpu', 'disk'], int | str] | None\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in\n# gigabytes); default: unset\ngpu_memory_limit: int | str | None\n# Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage: bool | None\n\n# The name of the chat template to use for training, following values are supported:\n# tokenizer_default: Uses the chat template that is available in the\n# tokenizer_config.json. If the chat template is not available in the tokenizer, it will\n# raise an error. This is the default value.\n# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.\n# E.g. tokenizer_default_fallback_chatml. 
This is useful when the chat template is not\n# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.\n# The custom jinja template should be provided in the chat_template_jinja field. The\n# selected chat template will be saved to the tokenizer_config.json for easier\n# inferencing\nchat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None\n# Custom jinja template or path to jinja file for chat template. This will be only used\n# if chat_template is set to `jinja` or `null` (in which case chat_template is\n# automatically set to `jinja`). Default is null.\nchat_template_jinja: str | None\n# Additional kwargs to pass to the chat template. This is useful for customizing the\n# chat template. For example, you can pass `thinking=False` to add a generation prompt\n# to the chat template.\nchat_template_kwargs: dict[str, Any] | None\n# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the\n# boundaries between conversation turns. For example: ['/INST', '</s>',\n# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is\n# useful for templates that use multiple delimiter tokens.\neot_tokens: list[str] | None\n# Changes the default system message. Currently only supports chatml.\ndefault_system_message: str | None\n\n# Token index or indices to adjust embedding weights to the mean of the other tokens.\n# This is useful when the model has untrained embeddings.\nfix_untrained_tokens: int | list[int] | None\n\nis_preprocess: bool | None\npreprocess_iterable: bool | None\n\n# Total number of tokens - internal use\ntotal_num_tokens: int | None\ntotal_supervised_tokens: int | None\n# You can set these packing optimizations AFTER starting a training at least once. The\n# trainer will provide recommended values for these values.\nsample_packing_eff_est: float | None\naxolotl_config_path: str | None\n\n# Internal use only - Used to identify which the model is based on\nis_falcon_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on\nis_llama_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on. Please note that if\n# you set this to true, `padding_side` will be set to 'left' by default\nis_mistral_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on\nis_qwen_derived_model: bool | None\n\n# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available\n# plugins or doc below for more details.\n# https://docs.axolotl.ai/docs/custom_integrations.html\nplugins: list[str] | None\n# Enable sample generation during training for monitoring\ngenerate_samples: bool | None = False\n# Number of samples to generate at each interval\nnum_generation_samples: int | None = 3\n# Maximum new tokens to generate per sample\ngeneration_max_new_tokens: int | None = 50\n# Temperature for sample generation (0.0 = greedy)\ngeneration_temperature: float | None = 0.7\n# Nucleus sampling parameter for generation\ngeneration_top_p: float | None\n# Top-k sampling parameter for generation\ngeneration_top_k: int | None\n# Ratio of input to use as prompt (0.0-1.0)\ngeneration_prompt_ratio: float | None = 0.5\n# Whether to use sampling (vs greedy decoding)\ngeneration_do_sample: bool | None = True\n\n# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. 
This\n# can also be a relative path to a model on disk\nbase_model: str (required)\n# If the base_model repo on hf hub doesn't include configuration .json files, You can\n# set that here, or leave this empty to default to base_model\nbase_model_config: str | None\n# transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to\n# AutoConfig.\ncls_model_config: str | None\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one defined in the base model\ntokenizer_config: str | None\n# use_fast option for tokenizer loading from_pretrained, default to True\ntokenizer_use_fast: bool | None\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy: bool | None\n# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-\n# common tokenizer.\ntokenizer_use_mistral_common: bool | None\n# Corresponding tokenizer for the model AutoTokenizer is a good choice\ntokenizer_type: str | None\n# transformers processor class\nprocessor_type: str | None\n# Whether to save jinja files for tokenizer, transformers default is True\ntokenizer_save_jinja_files: bool | None = True\n# Trust remote code for untrusted source\ntrust_remote_code: bool | None\n\n# Don't move the model to the device before sharding. Set to `false` to revert to legacy\n# behavior.\nexperimental_skip_move_to_device: bool | None = True\n\n# Use custom kernels, e.g. MegaBlocks.\nuse_kernels: bool | None\n\n# Model loading quantization config\nmodel_quantization_config: Literal['Mxfp4Config'] | None\n# kwargs for model quantization config\nmodel_quantization_config_kwargs: dict[str, Any] | None\n\n# Where to save the full-finetuned model to\noutput_dir: str = ./model-out\n# push checkpoints to hub\nhub_model_id: str | None\n# how to push checkpoints to hub\nhub_strategy: str | None\n# branch/revision to push to on hub (default: main)\nhub_revision: str | None\n# Whether to save the model using safetensors format. Defaults to True.\nsave_safetensors: bool | None = True\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: bool | None = False\n# Use bitsandbytes 4 bit\nload_in_4bit: bool | None = False\n\n# If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all\n# parameters in original model\nadapter: Literal['lora', 'qlora', 'llama-adapter'] | None\n# If you already have a lora model trained that you want to load, put that here. This\n# means after training, if you want to test the model, you should set this to the value\n# of `output_dir`. Note that if you merge an adapter to the base model, a new\n# subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir: str | None\nlora_r: int | None\nlora_alpha: int | None\nlora_fan_in_fan_out: bool | None\nlora_target_modules: str | list[str] | None\nlora_target_parameters: str | list[str] | None\n# If true, will target all linear modules\nlora_target_linear: bool | None\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules\n# because they need to know the new tokens. For LLaMA and Mistral, you need to save\n# `embed_tokens` and `lm_head`. It may vary for other models. 
`embed_tokens` converts\n# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\nlora_modules_to_save: list[str] | None\nlora_dropout: float | None = 0.0\n# The layer indices to transform, otherwise, apply to all layers\npeft_layers_to_transform: list[int] | None\npeft_layers_pattern: list[str] | None\n\npeft: PeftConfig | None\n # For PeftConfig:\n # Configuration options for loftq initialization for LoRA\n loftq_config: LoftQConfig | None\n # For LoftQConfig:\n # typically 4 bits\n loftq_bits: int = 4\n\n# Whether to use DoRA.\npeft_use_dora: bool | None\n# Whether to use RSLoRA.\npeft_use_rslora: bool | None\n# List of layer indices to replicate.\npeft_layer_replication: list[tuple[int, int]] | None\n# How to initialize LoRA weights. Default to True which is MS original implementation.\npeft_init_lora_weights: bool | str | None\n# A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict\n# mapping an embedding layer name to its trainable token indices. See\n# https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-\n# tokens-alongside-lora\npeft_trainable_token_indices: list[int] | dict[str, list[int]] | None\n# Whether to tie adapter weights for tied model weights. See\n# https://github.com/huggingface/peft/issues/2864\npeft_ensure_weight_tying: bool | None\n# Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT.\npeft_autocast_adapter_dtype: bool | None\n\n# load qlora model in sharded format for FSDP using answer.ai technique.\nqlora_sharded_model_loading: bool | None = False\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it\n# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: bool | None\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: bool | None\n# optional overrides to the bnb 4bit quantization configuration\nbnb_config_kwargs: dict[str, Any] | None\n\n# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_ratio: float | None\n# loraplus learning rate for lora embedding layers. Default value is 1e-6.\nloraplus_lr_embedding: float | None = 1e-06\n\nmerge_lora: bool | None\n# Method to use for LoRA merging. 'memory_efficient' (default) processes shards\n# individually to reduce memory usage, 'legacy' loads the full model into memory.\nmerge_method: Literal['legacy', 'memory_efficient'] | None = memory_efficient\n\n# Whether to use ReLoRA. Use with jagged_restart_*steps options.\nrelora: bool | None\n# threshold for optimizer magnitude when pruning\nrelora_prune_ratio: float | None\n# True to perform lora weight merges on cpu during restarts, for modest gpu memory\n# savings\nrelora_cpu_offload: bool | None\n\n# how often to reset for jagged restarts\njagged_restart_steps: int | None\n# how many warmup steps to take after reset for jagged restarts\njagged_restart_warmup_steps: int | None\n# how many anneal steps to take before reset for jagged restarts\njagged_restart_anneal_steps: int | None\n\n# If greater than 1, backpropagation will be skipped and the gradients will be\n# accumulated for the given number of steps.\ngradient_accumulation_steps: int | None = 1\n# The number of samples to include in each batch. This is the number of samples sent to\n# each GPU. 
Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: int | None = 1\n# Total batch size; we do not recommend setting this manually\nbatch_size: int | None\n# per gpu micro batch size for evals, defaults to value of micro_batch_size\neval_batch_size: int | None\n\n# whether to find a batch size that fits in memory. Passed to underlying transformers\n# Trainer\nauto_find_batch_size: bool | None\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: bool | None = False\n# Group similarly sized data to minimize padding. May be slower to start, as it must\n# download and sort the entire dataset. Note that training loss may have an oscillating\n# pattern with this enabled.\ngroup_by_length: bool | None\n\nlearning_rate: str | float (required)\nembedding_lr: float | None\nembedding_lr_scale: float | None\n# Specify weight decay\nweight_decay: float | None = 0.0\n# Specify optimizer\noptimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED\n# Dictionary of arguments to pass to the optimizer\noptim_args: str | dict[str, Any] | None\n# The target modules to optimize, i.e. the module names that you would like to train,\n# right now this is used only for GaLore algorithm\noptim_target_modules: list[str] | Literal['all_linear'] | None\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path: str | None\nlr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE\n# Specify a scheduler and kwargs to use with the optimizer\nlr_scheduler_kwargs: dict[str, Any] | None\nlr_quadratic_warmup: bool | None\n# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of\n# peak lr\ncosine_min_lr_ratio: float | None\n# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means\n# start cosine_min_lr at 80% of training step\ncosine_constant_lr_ratio: float | None\n# Learning rate div factor\nlr_div_factor: float | None\n\nlr_groups: list[LrGroup] | None\n # For LrGroup:\n name: str (required)\n modules: list[str] (required)\n lr: float (required)\n\n# adamw hyperparams\nadam_epsilon: float | None\n# only used for CAME Optimizer\nadam_epsilon2: float | None\n# adamw hyperparams\nadam_beta1: float | None\n# adamw hyperparams\nadam_beta2: float | None\n# only used for CAME Optimizer\nadam_beta3: float | None\n\n# Dion Optimizer learning rate\ndion_lr: float | None\n# Dion Optimizer momentum\ndion_momentum: float | None\n# Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank\n# dimension.\ndion_rank_fraction: float | None = 1.0\n# Dion Optimizer: Round up the low-rank dimension to a multiple of this number. 
This may\n# be useful to ensure even sharding.\ndion_rank_multiple_of: int | None = 1\n\n# Gradient clipping max norm\nmax_grad_norm: float | None\nnum_epochs: float = 1.0\n\nuse_wandb: bool | None\n# Set the name of your wandb run\nwandb_name: str | None\n# Set the ID of your wandb run\nwandb_run_id: str | None\n# \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn\n# off wandb\nwandb_mode: str | None\n# Your wandb project name\nwandb_project: str | None\n# A wandb Team name if using a Team\nwandb_entity: str | None\nwandb_watch: str | None\n# \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only\n# at the end of training\nwandb_log_model: str | None\n\nuse_mlflow: bool | None\n# URI to mlflow\nmlflow_tracking_uri: str | None\n# Your experiment name\nmlflow_experiment_name: str | None\n# Your run name\nmlflow_run_name: str | None\n# set to true to copy each saved checkpoint on each save to mlflow artifact registry\nhf_mlflow_log_artifacts: bool | None\n\n# Enable or disable Comet integration.\nuse_comet: bool | None\n# API key for Comet. Recommended to set via `comet login`.\ncomet_api_key: str | None\n# Workspace name in Comet. Defaults to the user's default workspace.\ncomet_workspace: str | None\n# Project name in Comet. Defaults to Uncategorized.\ncomet_project_name: str | None\n# Identifier for the experiment. Used to append data to an existing experiment or\n# control the key of new experiments. Defaults to a random key.\ncomet_experiment_key: str | None\n# Create a new experiment (\"create\") or log to an existing one (\"get\"). Default\n# (\"get_or_create\") auto-selects based on configuration.\ncomet_mode: str | None\n# Set to True to log data to Comet server, or False for offline storage. Default is\n# True.\ncomet_online: bool | None\n# Dictionary for additional configuration settings, see the doc for more details.\ncomet_experiment_config: dict[str, Any] | None\n\nuse_trackio: bool | None\n# Your trackio project name\ntrackio_project_name: str | None\n# Set the name of your trackio run\ntrackio_run_name: str | None\n# Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)\ntrackio_space_id: str | None\n\n# Enable OpenTelemetry metrics collection and Prometheus export\nuse_otel_metrics: bool | None = False\n# Host to bind the OpenTelemetry metrics server to\notel_metrics_host: str | None = localhost\n# Port for the Prometheus metrics HTTP server\notel_metrics_port: int | None = 8000\n\n# the number of active layers in LISA\nlisa_n_layers: int | None\n# how often to switch layers in LISA\nlisa_step_interval: int | None\n# path under the model to access the layers\nlisa_layers_attribute: str | None = model.layers\n\ngradio_title: str | None\ngradio_share: bool | None\ngradio_server_name: str | None\ngradio_server_port: int | None\ngradio_max_new_tokens: int | None\ngradio_temperature: float | None\n\nuse_ray: bool = False\nray_run_name: str | None\nray_num_workers: int = 1\nresources_per_worker: dict\n\n# The size of the image to resize to. It can be an integer (resized into padded-square\n# image) or a tuple (width, height). If not provided, we will attempt to load from\n# preprocessor.size, otherwise, images won't be resized.\nimage_size: int | tuple[int, int] | None\n# The resampling algorithm to use for image resizing. Default is bilinear. 
Please refer\n# to PIL.Image.Resampling for more details.\nimage_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None\n\n# optional overrides to the base model configuration\noverrides_of_model_config: dict[str, Any] | None\n# optional overrides the base model loading from_pretrained\noverrides_of_model_kwargs: dict[str, Any] | None\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good\n# choice too\ntype_of_model: str | None\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model: str | None\n\nmax_packed_sequence_len: int | None\nrope_scaling: Any | None\nnoisy_embedding_alpha: float | None\ndpo_beta: float | None\nevaluation_strategy: str | None\neval_table_size: int | None\neval_max_new_tokens: int | None\ndpo_use_logits_to_keep: bool | None\ndpo_generate_during_eval: bool | None\ndpo_norm_loss: bool | None\nrpo_alpha: float | None",
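The batching, parallelism, and checkpointing options documented above interact: the per-GPU batch is micro_batch_size * gradient_accumulation_steps, context-parallel ranks share a single sequence rather than adding samples, and each eval/save cadence option is mutually exclusive with its step-based counterpart. A minimal sketch of how they might be combined on a 4-GPU node; the option names come from this reference, but every value below is an illustrative assumption, not a recommendation:

sequence_len: 4096
context_parallel_size: 2         # split each sequence across 2 of the 4 GPUs
micro_batch_size: 2              # samples sent to each GPU per forward pass
gradient_accumulation_steps: 8   # per-GPU batch = 2 * 8 = 16 samples per optimizer step
warmup_ratio: 0.03               # mutually exclusive with warmup_steps
evals_per_epoch: 2               # mutually exclusive with eval_steps
saves_per_epoch: 1               # mutually exclusive with save_steps
save_total_limit: 3              # keep at most 3 checkpoints on disk

With context_parallel_size: 2 on 4 GPUs there are 2 data-parallel replicas, so the global batch in this sketch works out to roughly 2 * 8 * 2 = 32 samples per optimizer step.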
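A similar sketch for the adapter options above: a QLoRA run combines load_in_4bit with adapter: qlora, and when new tokens are added to the tokenizer the embedding layers are listed in lora_modules_to_save so the adapter can learn the new rows (per the note on `embed_tokens` and `lm_head`). Option names come from this reference; the rank, alpha, dropout, and token strings below are illustrative assumptions only:

load_in_4bit: true
adapter: qlora
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true          # target all linear modules
lora_modules_to_save:             # needed here because tokens are added below
  - embed_tokens
  - lm_head
special_tokens:
  pad_token: "<|pad|>"            # illustrative value
tokens:
  - "<|new_tool_token|>"          # illustrative added token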
+ "text": "# Allow overwrite yml config using from cli\nstrict: bool | None = False\n# Resume from a specific checkpoint dir\nresume_from_checkpoint: str | None\n# If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: bool | None\n# Resize the model embeddings when new tokens are added to multiples of 32. This is\n# reported to improve training speed on some models\nresize_token_embeddings_to_32x: bool | None\nmean_resizing_embeddings: bool | None = False\n\n# Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.\nshrink_embeddings: bool | None\n# Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs\nembeddings_skip_upcast: bool | None\n# Reinitialize model weights randomly instead of loading pretrained weights\nreinit_weights: bool | None\n\n# module to custom trainer class to use for training\ntrainer_cls: str | None\n\n# Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo', 'ebft'\nrl: RLType | None\n\ntrl: TRLConfig | None\n # For TRLConfig:\n # Beta parameter for the RL training. Same as `rl_beta`. Use\n beta: float | None\n # Maximum length of the completion for RL training.\n max_completion_length: int | None\n\n # Whether to use VLLM for RL training.\n use_vllm: bool = False\n # VLLM mode to use, one of 'server' or 'colocate'\n vllm_mode: Literal['server', 'colocate'] | None\n # Host of the vLLM server to connect to.\n vllm_server_host: str | None = 0.0.0.0\n # Port of the vLLM server to connect to.\n vllm_server_port: int | None = 8000\n # Total timeout (in seconds) to wait for the vLLM server to respond.\n vllm_server_timeout: int | None\n # Regex for vLLM guided decoding.\n vllm_guided_decoding_regex: str | None\n\n # List of reward functions to load. Paths must be importable from current dir.\n reward_funcs: list[str] | None\n # List of reward weights for the reward functions.\n reward_weights: list[float] | None\n # Batch size for generation. Controls how many unique prompts are generated per step.\n # Should be num_generations * data_parallel_size for full DP utilization.\n generation_batch_size: int | None\n # Number of generations to sample.\n num_generations: int | None\n # Whether to log completions.\n log_completions: bool | None = False\n # Number of completions to print when log_completions is True.\n num_completions_to_print: int | None\n # Controls whether importance sampling ratios are computed at the `'token'` or\n # `'sequence'` level. For GSPO, use `sequence`, default is None which corresponds to\n # the original GRPO paper.\n importance_sampling_level: Literal['sequence', 'token'] | None\n\n # Whether to sync the reference model.\n sync_ref_model: bool | None = False\n # Mixup alpha for the reference model.\n ref_model_mixup_alpha: float | None = 0.9\n # Sync steps for the reference model.\n ref_model_sync_steps: int | None = 64\n # Whether to scale rewards by their standard deviation.\n scale_rewards: bool = True\n\n # Sampling temperature for the GRPO policy.\n temperature: float | None\n # Top-p sampling probability for the generation policy.\n top_p: float | None\n # Top-k sampling for the generation policy.\n top_k: int | None\n # Minimum probability for the generation policy.\n min_p: float | None\n # Penalty for tokens that appear in prompt and generated text.\n repetition_penalty: float | None\n # Additional generation parameters passed to vLLM SamplingParams. 
Useful for\n # stop_token_ids, seed, frequency_penalty, etc.\n generation_kwargs: dict[str, Any] | None\n # Additional kwargs for the chat template. E.g., {enable_thinking: false} for Qwen3.5\n # models.\n chat_template_kwargs: dict[str, Any] | None\n # Number of iterations per batch (μ) for GRPO.\n num_iterations: int | None\n # Epsilon value for clipping in the GRPO algorithm.\n epsilon: float | None\n # Upper-bound epsilon value for clipping in the GRPO algorithm.\n epsilon_high: float | None\n # Whether to use Liger loss for GRPO.\n use_liger_loss: bool | None\n # Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.\n loss_type: str | None\n # Whether to exclude truncated completions from loss calculation.\n mask_truncated_completions: bool = False\n # Enable sleep mode for vLLM to offload VRAM when idle\n vllm_enable_sleep_mode: bool | None\n # Path to custom rollout function. Must be importable from current dir.\n rollout_func: str | None\n # Multi-objective reward aggregation strategy. 'sum_then_normalize' (GRPO default):\n # weights and sums rewards first, then normalizes. 'normalize_then_sum' (GDPO):\n # normalizes each reward independently, then sums.\n multi_objective_aggregation: Literal['sum_then_normalize', 'normalize_then_sum'] | None\n\n # Use the GRPODataProducer protocol for online data generation.\n use_data_producer: bool = False\n # Generate rollouts in a background thread while training on the previous rollout.\n async_prefetch: bool = False\n # Number of rollouts to prefetch ahead of training.\n prefetch_depth: int | None\n # Sync model weights to vLLM every N optimizer steps (async mode only).\n vllm_sync_interval: int | None\n # Score prompt groups incrementally instead of the full batch at once.\n streaming_partial_batch: bool | None\n # Minimum prompt groups to score per streaming chunk.\n streaming_min_groups: int | None\n # Apply IS correction for distribution mismatch between vLLM and training model.\n vllm_importance_sampling_correction: bool | None\n # IS mode: token_truncate, token_mask, sequence_truncate, or sequence_mask.\n vllm_importance_sampling_mode: Literal['token_truncate', 'token_mask', 'sequence_truncate', 'sequence_mask'] | None\n # Cap C for IS ratio clipping/masking.\n vllm_importance_sampling_cap: float | None\n # KL threshold for off-policy sequence masking (OPSM). None = disabled.\n off_policy_mask_threshold: float | None\n # Apply IS correction to KL divergence term.\n use_bias_correction_kl: bool | None\n\n # Number of persistent subprocess workers for parallel reward computation. Each worker\n # has its own main thread so signal.alarm() (used by math_verify) works correctly.\n # Work is sharded across workers by prompt groups. Only used with\n # use_data_producer=True and non-nn.Module reward functions.\n reward_num_workers: int = 1\n # [Experimental, disabled by default] Size of the replay buffer for storing high-\n # signal rollout groups. When > 0, groups with reward variance are cached and used to\n # replace zero-signal groups (where all rewards are identical). Set to 0 to disable.\n # Only used with use_data_producer=True.\n replay_buffer_size: int = 0\n # When True (default), recompute old_per_token_logps for replayed groups using the\n # current training model. This fixes the importance sampling mismatch that occurs when\n # replaying stale data. Only relevant when replay_buffer_size > 0.\n replay_recompute_logps: bool = True\n # Fraction of total training steps after which deferred re-rolling begins. 
Zero-signal\n # prompts (where all rewards in a group are identical) are buffered and re-injected\n # into later batches when the model is more likely to solve them. Set to 1.0 to\n # disable. Only used with use_data_producer=True.\n reroll_start_fraction: float = 1.0\n # Maximum number of prompt groups to replace with re-roll candidates per batch. Higher\n # values increase data utilization but reduce prompt diversity. Only used with\n # use_data_producer=True.\n reroll_max_groups: int = 1\n # When True, skip gradient computation for micro-batches where all advantages are zero\n # (no learning signal). This avoids the forward/backward pass entirely when no\n # learning signal is present. The step is logged with skipped_zero_adv_batches=1 for\n # monitoring.\n skip_zero_advantage_batches: bool = True\n # Sync LoRA adapter to vLLM via filesystem instead of merging + NCCL broadcast. Auto-\n # selects vllm_serve_lora serve module. Syncs only LoRA adapter weights vs full merged\n # model.\n vllm_lora_sync: bool = False\n\nvllm: VllmConfig | None\n # For VllmConfig:\n # Device to use for VLLM\n device: str | None = auto\n # Tensor parallel size for VLLM\n tensor_parallel_size: int | None\n # Data parallel size for VLLM\n data_parallel_size: int | None\n # GPU memory utilization for VLLM\n gpu_memory_utilization: float | None = 0.9\n # Data type for VLLM\n dtype: str | None = auto\n # Maximum length of the model context for VLLM\n max_model_len: int | None\n # Enable prefix caching for VLLM\n enable_prefix_caching: bool | None\n # Host for the vLLM server to start on\n host: str | None = 0.0.0.0\n # Port of the vLLM server to start on\n port: int | None = 8000\n\n # Enable reasoning for VLLM\n enable_reasoning: bool | None\n # Reasoning parser for VLLM\n reasoning_parser: str | None\n # Disable CUDA graph capture in vLLM. Required for models with causal_conv1d (e.g.,\n # Qwen3.5 hybrid linear attention).\n enforce_eager: bool | None\n # Python module for vLLM serve script. Set to 'axolotl.scripts.vllm_serve_lora' for\n # native LoRA support, or leave None for default TRL serve.\n serve_module: str | None\n # vLLM worker extension class for weight synchronization. 
Defaults to\n # 'trl.scripts.vllm_serve.WeightSyncWorkerExtension'.\n worker_extension_cls: str | None\n\n# Configuration for Energy-Based Fine-Tuning (EBFT)\nebft: EBFTConfig | None\n # For EBFTConfig:\n # Fractional layer depths for feature extraction (e.g., [0.25, 0.5, 0.75])\n feature_layers: list[float] = [0.25, 0.5, 0.75]\n # Embedding method: 'last_token', 'mean_pooling', 'completion_mean', or 'concat'\n embed_method: Literal['last_token', 'mean_pooling', 'completion_mean', 'concat'] = last_token\n # Apply SVD whitening to feature embeddings\n use_whitening: bool = False\n # Coefficient for alignment reward (cosine similarity with ground truth)\n alignment_coef: float = 1.0\n # Coefficient for diversity penalty (pairwise similarity between samples)\n diversity_coef: float = 1.0\n # Cross-entropy loss coefficient on ground-truth tokens\n ce_coef: float = 0.0\n # Set per-batch max_tokens based on ground-truth length\n adaptive_max_tokens: bool = True\n # Multiplier for ground-truth token count when computing adaptive max_tokens\n gt_length_multiplier: float = 1.5\n\n # EBFT mode: 'structured' (QA with vLLM) or 'strided' (unstructured text)\n mode: Literal['structured', 'strided'] = structured\n # Stride between anchor points (tokens)\n stride: int = 8\n # Context window size per block\n context_length: int = 8\n # Tokens to generate per block\n generate_max_len: int = 8\n # Independent rollouts per document\n n_samples_per_prompt: int = 4\n # Sampling temperature for strided generation\n temperature: float = 0.6\n # Top-p nucleus sampling threshold\n top_p: float = 1.0\n # RL policy gradient loss coefficient\n rl_coef: float = 1.0\n # Advantage estimator: 'rloo', 'group_norm', 'reinforce'\n advantage_estimator: Literal['rloo', 'group_norm', 'reinforce'] = rloo\n # Minimum tokens into completion before placing anchors. Skips anchors too close to\n # the prompt boundary where features are dominated by prompt context.\n min_completion_prefix: int = 0\n\nqat: QATConfig | None\n # For QATConfig:\n # Fake quantization layout to use for activation quantization.\n activation_dtype: TorchAOQuantDType | None\n # Fake quantization layout to use for weight quantization.\n weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8\n # Quantize embedding\n quantize_embedding: bool | None = False\n # The number of elements in each group for per-group fake quantization\n group_size: int | None = 32\n # The number of steps to apply fake quantization after\n fake_quant_after_n_steps: int | None\n\nquantization: PTQConfig | None\n # For PTQConfig:\n # Fake quantization layout to use for weight quantization.\n weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8\n # Fake quantization layout to use for activation quantization.\n activation_dtype: TorchAOQuantDType | None\n # Whether to quantize the embedding layer.\n quantize_embedding: bool | None\n # The number of elements in each group for per-group fake quantization\n group_size: int | None = 32\n\n# Reward modelling: `True` or `False`\nreward_model: bool | None\n\n# Configuration for dynamic checkpointing (trigger by file or signal). Set 'enabled:\n# true' to activate this feature.\ndynamic_checkpoint: DynamicCheckpointConfig | None\n # For DynamicCheckpointConfig:\n # Enable dynamic checkpoint triggering during training. Create a file\n # 'axolotl_checkpoint.save' in the configured `output_dir` to trigger.\n enabled: bool = False\n # Check for trigger file every N steps (reduces I/O overhead). 
Default: 100\n check_interval: int = 10\n # Custom trigger filename (optional). If not specified, defaults to\n # 'axolotl_checkpoint.save'. Specify a filename (not a full path) to override the\n # default.\n trigger_file_path: str = \n\n# Process reward modelling: `True` or `False`\nprocess_reward_model: bool | None\n# Coefficient to incentivize the reward model to output mean-zero rewards (proposed by\n# https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`.\ncenter_rewards_coefficient: float | None\nnum_labels: int | None\n\n# Whether to perform weighting in DPO trainer\ndpo_use_weighting: bool | None\ndpo_label_smoothing: float | None\n# Precompute reference model log probabilities for DPO\nprecompute_ref_log_probs: bool | None\n\n# Whether to use Liger kernel for DPO loss.\ndpo_use_liger_kernel: bool | None\n\ndpo_padding_free: bool | None\n\n# A list of one or more datasets to finetune the model with\ndatasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None\n # For SFTDataset:\n # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n path: str | None\n # name of dataset split to load from\n split: str | None\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: str | UserDefinedPrompterType | None\n # For UserDefinedPrompterType:\n # Custom user instruction prompt\n system_prompt: str | None\n # Use {system} as key to be replaced\n system_format: str | None\n field_system: str | None\n field_instruction: str | None\n field_input: str | None\n field_output: str | None\n\n # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n # be replaced. 'format' can include {input}\n format: str | None\n # 'no_input_format' cannot include {input}\n no_input_format: str | None\n input_transform: str | None\n # split dataset into N pieces (use with shards_idx)\n shards: int | None\n # the index of sharded dataset to use\n shards_idx: int | None\n # process dataset in N sequential chunks for memory efficiency (exclusive with\n # `shards`)\n preprocess_shards: int | None\n conversation: str | None\n\n # The name of the chat template to use for training, following values are supported:\n # tokenizer_default: Uses the chat template that is available in the\n # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n # will raise an error. This is the default.\n # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n # template. The custom jinja template should be provided in the chat_template_jinja\n # field.\n chat_template: ChatTemplate | str | None\n # Custom jinja chat template or path to jinja file. 
Used only if `chat_template:\n # jinja` or empty.\n chat_template_jinja: str | None\n # path to source data files\n data_files: str | list[str] | None\n input_format: str | None\n # name of dataset configuration to load\n name: str | None\n # defines the datatype when path is a file\n ds_type: str | None\n # For `completion` datasets only, uses the provided field instead of `text` column\n field: str | None\n field_human: str | None\n field_model: str | None\n # Key containing the messages (default: \"messages\")\n field_messages: str | None\n # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n # schema](https://json-schema.org/learn/getting-started-step-by-step).\n field_tools: str | None\n # Key containing the reasoning trace (default: \"reasoning_content\").\n field_thinking: str | None\n # The key the chat template expects that indicates the reasoning trace.\n template_thinking_key: str | None\n\n message_field_role: str | None\n\n message_field_content: str | None\n # Mapping of properties from the input dataset to the chat template. (default:\n # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n # in the template but not in this mapping, the system will attempt to load it directly\n # from the message using the property name as the key. Example: In the mapping below,\n # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n # used as 'content' in the chat template.\n message_property_mappings: dict[str, str] | None\n # The key in the message turn that indicates via boolean whether tokens of a turn\n # should be considered for training. Useful to selectively train on certain turns\n # besides the `roles_to_train`.\n message_field_training: str | None\n # The key in the message turn that contains the training details. Useful to\n # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n # containing `begin_offset` (start character index in content), `end_offset` (end\n # character index in content), and `train` (boolean whether to train).\n message_field_training_detail: str | None\n # (for Qwen3 template only) Whether to split the assistant content based on a\n # reasoning trace inside delimited tags\n split_thinking: bool | None\n logprobs_field: str | None\n temperature: float | None\n # Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: list[str] | None\n # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n # turn, last: train on the last EOS token in the conversation\n train_on_eos: Literal['all', 'turn', 'last'] | None\n # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n # source roles will be mapped to the target role. The default is: user: [\"human\",\n # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n roles: dict[str, list[str]] | None\n # Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. 
If\n # you wish to, we recommend using a custom jinja template with the default system\n # message removed or adding a system turn with empty content.\n drop_system_message: bool | None\n # Trust remote code for untrusted source\n trust_remote_code: bool | None = False\n # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n # This can be a commit hash, tag, or branch name. If not specified, the latest version\n # will be used. This parameter is ignored for local datasets.\n revision: str | None\n\n # For DPODataset:\n path: str | None\n split: str | None\n type: UserDefinedDPOType | str | None\n # For UserDefinedDPOType:\n field_system: str | None\n field_prompt: str | None\n field_chosen: str | None\n field_rejected: str | None\n prompt_format: str | None\n chosen_format: str | None\n rejected_format: str | None\n data_files: list[str] | None\n revision: str | None\n field_messages: str | None\n\n # For KTODataset:\n path: str | None\n split: str | None\n type: UserDefinedKTOType | str | None\n # For UserDefinedKTOType:\n field_system: str | None\n field_prompt: str | None\n field_completion: str | None\n field_label: bool | None\n prompt_format: str | None\n completion_format: str | None\n data_files: list[str] | None\n trust_remote_code: bool | None = False\n revision: str | None\n\n # For StepwiseSupervisedDataset:\n path: str | None\n split: str | None\n data_files: list[str] | None\n revision: str | None\n step_separator: str | None\n max_completion_length: int | None\n train_on_last_step_only: bool | None\n\n # For SyntheticDataset:\n path: Literal['synthetic'] = synthetic\n type: Literal['_synthetic'] = _synthetic\n # Number of rows to generate\n length: int = 1000\n # Sequence length per row (defaults to sequence_len from config)\n sequence_length: int | None\n # Minimum token ID for generation\n min_input_id: int = 100\n # Maximum token ID for generation (defaults to tokenizer vocab_size)\n max_input_id: int | None\n # Random seed for reproducibility\n seed: int | None\n\n# A list of one or more datasets to eval the model with. You can use either\n# test_datasets, or val_set_size, but not both.\ntest_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None\n # For SFTDataset:\n # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n path: str | None\n # name of dataset split to load from\n split: str | None\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: str | UserDefinedPrompterType | None\n # For UserDefinedPrompterType:\n # Custom user instruction prompt\n system_prompt: str | None\n # Use {system} as key to be replaced\n system_format: str | None\n field_system: str | None\n field_instruction: str | None\n field_input: str | None\n field_output: str | None\n\n # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n # be replaced. 
'format' can include {input}\n format: str | None\n # 'no_input_format' cannot include {input}\n no_input_format: str | None\n input_transform: str | None\n # split dataset into N pieces (use with shards_idx)\n shards: int | None\n # the index of sharded dataset to use\n shards_idx: int | None\n # process dataset in N sequential chunks for memory efficiency (exclusive with\n # `shards`)\n preprocess_shards: int | None\n conversation: str | None\n\n # The name of the chat template to use for training, following values are supported:\n # tokenizer_default: Uses the chat template that is available in the\n # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n # will raise an error. This is the default.\n # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n # template. The custom jinja template should be provided in the chat_template_jinja\n # field.\n chat_template: ChatTemplate | str | None\n # Custom jinja chat template or path to jinja file. Used only if `chat_template:\n # jinja` or empty.\n chat_template_jinja: str | None\n # path to source data files\n data_files: str | list[str] | None\n input_format: str | None\n # name of dataset configuration to load\n name: str | None\n # defines the datatype when path is a file\n ds_type: str | None\n # For `completion` datasets only, uses the provided field instead of `text` column\n field: str | None\n field_human: str | None\n field_model: str | None\n # Key containing the messages (default: \"messages\")\n field_messages: str | None\n # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n # schema](https://json-schema.org/learn/getting-started-step-by-step).\n field_tools: str | None\n # Key containing the reasoning trace (default: \"reasoning_content\").\n field_thinking: str | None\n # The key the chat template expects that indicates the reasoning trace.\n template_thinking_key: str | None\n\n message_field_role: str | None\n\n message_field_content: str | None\n # Mapping of properties from the input dataset to the chat template. (default:\n # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n # in the template but not in this mapping, the system will attempt to load it directly\n # from the message using the property name as the key. Example: In the mapping below,\n # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n # used as 'content' in the chat template.\n message_property_mappings: dict[str, str] | None\n # The key in the message turn that indicates via boolean whether tokens of a turn\n # should be considered for training. Useful to selectively train on certain turns\n # besides the `roles_to_train`.\n message_field_training: str | None\n # The key in the message turn that contains the training details. Useful to\n # selectively train on certain tokens in a turn. 
The value of the key is a List[Dict]\n # containing `begin_offset` (start character index in content), `end_offset` (end\n # character index in content), and `train` (boolean whether to train).\n message_field_training_detail: str | None\n # (for Qwen3 template only) Whether to split the assistant content based on a\n # reasoning trace inside delimited tags\n split_thinking: bool | None\n logprobs_field: str | None\n temperature: float | None\n # Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: list[str] | None\n # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n # turn, last: train on the last EOS token in the conversation\n train_on_eos: Literal['all', 'turn', 'last'] | None\n # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n # source roles will be mapped to the target role. The default is: user: [\"human\",\n # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n roles: dict[str, list[str]] | None\n # Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. If\n # you wish to, we recommend using a custom jinja template with the default system\n # message removed or adding a system turn with empty content.\n drop_system_message: bool | None\n # Trust remote code for untrusted source\n trust_remote_code: bool | None = False\n # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n # This can be a commit hash, tag, or branch name. If not specified, the latest version\n # will be used. This parameter is ignored for local datasets.\n revision: str | None\n\n # For DPODataset:\n path: str | None\n split: str | None\n type: UserDefinedDPOType | str | None\n # For UserDefinedDPOType:\n field_system: str | None\n field_prompt: str | None\n field_chosen: str | None\n field_rejected: str | None\n prompt_format: str | None\n chosen_format: str | None\n rejected_format: str | None\n data_files: list[str] | None\n revision: str | None\n field_messages: str | None\n\n # For KTODataset:\n path: str | None\n split: str | None\n type: UserDefinedKTOType | str | None\n # For UserDefinedKTOType:\n field_system: str | None\n field_prompt: str | None\n field_completion: str | None\n field_label: bool | None\n prompt_format: str | None\n completion_format: str | None\n data_files: list[str] | None\n trust_remote_code: bool | None = False\n revision: str | None\n\n # For StepwiseSupervisedDataset:\n path: str | None\n split: str | None\n data_files: list[str] | None\n revision: str | None\n step_separator: str | None\n max_completion_length: int | None\n train_on_last_step_only: bool | None\n\n # For SyntheticDataset:\n path: Literal['synthetic'] = synthetic\n type: Literal['_synthetic'] = _synthetic\n # Number of rows to generate\n length: int = 1000\n # Sequence length per row (defaults to sequence_len from config)\n sequence_length: int | None\n # Minimum token ID for generation\n min_input_id: int = 100\n # Maximum token ID for generation (defaults to tokenizer vocab_size)\n max_input_id: int | None\n # Random seed for reproducibility\n seed: int | None\n\n# If false, the datasets will not be shuffled and will keep their original order in\n# `datasets`. 
The same applies to the `test_datasets` option and the\n# `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: bool | None = True\n# If true, each dataset in `datasets` will be shuffled before merging. This allows\n# curriculum learning strategies to be applied at the dataset level. Default is false.\nshuffle_before_merging_datasets: bool | None = False\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: str | None\n# Num shards for whole dataset\ndataset_shard_num: int | None\n# Index of shard to use for whole dataset\ndataset_shard_idx: int | None\nskip_prepare_dataset: bool | None = False\n# Number of shards to save the prepared dataset\nnum_dataset_shards_to_save: int | None\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None\n # For PretrainingDataset:\n name: str | None\n path: str | None\n split: str | None = train\n text_column: str | None = text\n type: str | None = pretrain\n trust_remote_code: bool | None = False\n data_files: str | None\n skip: int | None\n\n # For SFTDataset:\n # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n path: str | None\n # name of dataset split to load from\n split: str | None\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: str | UserDefinedPrompterType | None\n # For UserDefinedPrompterType:\n # Custom user instruction prompt\n system_prompt: str | None\n # Use {system} as key to be replaced\n system_format: str | None\n field_system: str | None\n field_instruction: str | None\n field_input: str | None\n field_output: str | None\n\n # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n # be replaced. 'format' can include {input}\n format: str | None\n # 'no_input_format' cannot include {input}\n no_input_format: str | None\n input_transform: str | None\n # split dataset into N pieces (use with shards_idx)\n shards: int | None\n # the index of sharded dataset to use\n shards_idx: int | None\n # process dataset in N sequential chunks for memory efficiency (exclusive with\n # `shards`)\n preprocess_shards: int | None\n conversation: str | None\n\n # The name of the chat template to use for training, following values are supported:\n # tokenizer_default: Uses the chat template that is available in the\n # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n # will raise an error. This is the default.\n # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n # template. The custom jinja template should be provided in the chat_template_jinja\n # field.\n chat_template: ChatTemplate | str | None\n # Custom jinja chat template or path to jinja file. 
Used only if `chat_template:\n # jinja` or empty.\n chat_template_jinja: str | None\n # path to source data files\n data_files: str | list[str] | None\n input_format: str | None\n # name of dataset configuration to load\n name: str | None\n # defines the datatype when path is a file\n ds_type: str | None\n # For `completion` datasets only, uses the provided field instead of `text` column\n field: str | None\n field_human: str | None\n field_model: str | None\n # Key containing the messages (default: \"messages\")\n field_messages: str | None\n # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n # schema](https://json-schema.org/learn/getting-started-step-by-step).\n field_tools: str | None\n # Key containing the reasoning trace (default: \"reasoning_content\").\n field_thinking: str | None\n # The key the chat template expects that indicates the reasoning trace.\n template_thinking_key: str | None\n\n message_field_role: str | None\n\n message_field_content: str | None\n # Mapping of properties from the input dataset to the chat template. (default:\n # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n # in the template but not in this mapping, the system will attempt to load it directly\n # from the message using the property name as the key. Example: In the mapping below,\n # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n # used as 'content' in the chat template.\n message_property_mappings: dict[str, str] | None\n # The key in the message turn that indicates via boolean whether tokens of a turn\n # should be considered for training. Useful to selectively train on certain turns\n # besides the `roles_to_train`.\n message_field_training: str | None\n # The key in the message turn that contains the training details. Useful to\n # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n # containing `begin_offset` (start character index in content), `end_offset` (end\n # character index in content), and `train` (boolean whether to train).\n message_field_training_detail: str | None\n # (for Qwen3 template only) Whether to split the assistant content based on a\n # reasoning trace inside delimited tags\n split_thinking: bool | None\n logprobs_field: str | None\n temperature: float | None\n # Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: list[str] | None\n # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n # turn, last: train on the last EOS token in the conversation\n train_on_eos: Literal['all', 'turn', 'last'] | None\n # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n # source roles will be mapped to the target role. The default is: user: [\"human\",\n # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n roles: dict[str, list[str]] | None\n # Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. 
If\n # you wish to, we recommend using a custom jinja template with the default system\n # message removed or adding a system turn with empty content.\n drop_system_message: bool | None\n # Trust remote code for untrusted source\n trust_remote_code: bool | None = False\n # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n # This can be a commit hash, tag, or branch name. If not specified, the latest version\n # will be used. This parameter is ignored for local datasets.\n revision: str | None\n\n# The maximum number of processes to use while preprocessing your input dataset. This\n# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of\n# vCPUs via RUNPOD_CPU_COUNT.\ndataset_processes: int | None\n# The maximum number of processes to use while preprocessing your input dataset. This\n# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of\n# vCPUs via RUNPOD_CPU_COUNT.\ndataset_num_proc: int | None\n\n# Deduplicates datasets and test_datasets with identical entries\ndataset_exact_deduplication: bool | None\n# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking\n# too much storage\ndataset_keep_in_memory: bool | None\ndataloader_pin_memory: bool | None\ndataloader_num_workers: int | None\ndataloader_prefetch_factor: int | None\ndataloader_drop_last: bool | None\n\naccelerator_config: dict[str, Any] | None\n\nremove_unused_columns: bool | None\n\n# Push prepared dataset to hub - repo_org/repo_name\npush_dataset_to_hub: str | None\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private\n# datasets. Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: bool | None\n\ndevice: Any | None\n# Passed through to transformers when loading the model when launched without\n# accelerate. Use `sequential` when training w/ model parallelism to limit memory\ndevice_map: Any | None\nworld_size: int | None\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank: int | None\nddp: bool | None\n\n# Seed for reproducibility\nseed: int | None\n# Advanced DDP Arguments - timeout\nddp_timeout: int | None\n# Advanced DDP Arguments - bucket cap in MB\nddp_bucket_cap_mb: int | None\n# Advanced DDP Arguments - broadcast buffers\nddp_broadcast_buffers: bool | None\nddp_find_unused_parameters: bool | None\n\n# Whether to run causal language model evaluation for metrics in\n# `eval_causal_lm_metrics`\ndo_causal_lm_eval: bool | None\n# HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter',\n# 'chrf', 'perplexity']\neval_causal_lm_metrics: list[str] | None\ndo_bench_eval: bool | None\nbench_dataset: str | None\nbench_split: str | None\nmetric_for_best_model: str | None\ngreater_is_better: bool | None\n\n# High loss value, indicating the learning has broken down (a good estimate is ~2 times\n# the loss at the start of training)\nloss_watchdog_threshold: float | None\n# Number of high-loss steps in a row before the trainer aborts (default: 3)\nloss_watchdog_patience: int | None\n\n# Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before\n# evaluations. Default is 0 (disabled).\ngc_steps: int | None\n\n# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.\n# require >=ampere\nbf16: Literal['auto'] | bool | None = auto\n# Use CUDA fp16\nfp16: bool | None\n# Enable FP8 mixed precision training using TorchAO. 
Best used in combination with\n# torch.compile.\nfp8: bool | None\n# Enable FSDP float8 all-gather optimization for FP8 training. Can improve training\n# speed by 10-15% when FSDP is enabled.\nfp8_enable_fsdp_float8_all_gather: bool | None\n# No AMP (automatic mixed precision) - require >=ampere\nbfloat16: bool | None\n# No AMP (automatic mixed precision)\nfloat16: bool | None\n# bool to use CUDA tf32 or 'auto' for automatic detection - require >=ampere\ntf32: Literal['auto'] | bool | None = auto\nfloat32: bool | None\n\n# Whether to use gradient checkpointing. Available options are: true, false, 'offload',\n# 'offload_disk'.\n# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False\n# Additional kwargs to pass to the trainer for gradient checkpointing\ngradient_checkpointing_kwargs: dict[str, Any] | None\n# Whether to offload activations. Available options are: true, false, 'legacy', 'disk'.\nactivation_offloading: Literal['legacy', 'disk'] | bool | None = False\n# Offload model layer parameters to CPU during forward, prefetch back during backward.\nlayer_offloading: bool | None = False\n\n# Freeze multimodal encoder parameters (vision, audio, etc.) for text-only training of\n# multimodal models. When True, parameters belonging to vision towers, audio towers,\n# multimodal projectors, and similar non-language modules are frozen\n# (requires_grad=False). This allows DDP training without\n# ddp_find_unused_parameters=True.\nfreeze_mm_modules: bool | None\n\n# List of regex patterns for parameter names to keep unfrozen. All other parameters will\n# be frozen via requires_grad=False. Note: range-based patterns (e.g.\n# embed_tokens.weight$[:32000]) use gradient zeroing rather than a true freeze, so\n# weight decay will still apply to the frozen portion and optimizer states are allocated\n# for the full parameter.\nunfrozen_parameters: list[str] | None\n\n# The maximum length of an input to train with; this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: int = 512\n# What to do when a tokenized row exceeds sequence_len. 'drop' removes the row;\n# 'truncate' slices tensors to sequence_len; 'raise' raises a ValueError. Defaults to\n# 'drop' for backward compatibility.\nexcess_length_strategy: Literal['drop', 'truncate', 'raise'] | None\n# The maximum length of an input for evaluation. If not specified, defaults to\n# sequence_len\neval_sequence_len: int | None\nmin_sample_len: int | None\n# maximum prompt length for RL training\nmax_prompt_len: int | None\n# Use efficient multi-packing with block diagonal attention and per sequence\n# position_ids. Recommended to set to 'true'\nsample_packing: bool | None\n# The number of samples packed at a time. Increasing the following values helps with\n# packing, but usually only slightly (<1%).\nsample_packing_group_size: int | None = 100000\n# The number of samples which can be packed into one sequence. Increase if using a large\n# sequence_len with many short samples.\nsample_packing_bin_size: int | None = 200\n# Whether to pack samples sequentially\nsample_packing_sequentially: bool | None\n# The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or\n# 'forkserver'\nsample_packing_mp_start_method: str | None\n# Set to 'false' if getting errors during eval with sample_packing on\neval_sample_packing: bool | None\n# Pad inputs so each step uses constant sized buffers. 
This will reduce memory\n# fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to\n# True if `sample_packing` enabled\npad_to_sequence_len: bool | None\n# Whether to use sequential sampling for curriculum learning\ncurriculum_sampling: bool | None\nmultipack_real_batches: bool | None\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening: Literal['auto'] | bool | None\n\nuse_pose: bool | None\npose_split_on_token_ids: list[int] | None\npose_max_context_len: int | None\npose_num_chunks: int | None\n\npretrain_multipack_buffer_size: int | None\n# whether to prevent cross attention for packed sequences during pretraining\npretrain_multipack_attn: bool | None = True\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation: bool | None\n\n# Use streaming mode for loading datasets\nstreaming: bool | None\n# Buffer size for multipack streaming datasets\nstreaming_multipack_buffer_size: int | None = 10000\n\n# Whether to use xformers attention patch https://github.com/facebookresearch/xformers\nxformers_attention: bool | None\n# Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/\n# torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention: bool | None\n# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention: bool | None\nflex_attention: bool | None\nflex_attn_compile_kwargs: dict[str, Any] | None\n# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention\nflash_attention: bool | None\n# Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_cross_entropy: bool | None\n# Whether to use flash-attention rms norm implementation - advanced use only\nflash_attn_rms_norm: bool | None\n# Whether to fuse part of the MLP into a single operation\nflash_attn_fuse_mlp: bool | None\n# Whether to use bettertransformers\nflash_optimum: bool | None\n# Whether to use SageAttention https://github.com/thu-ml/SageAttention\nsage_attention: bool | None\n\neager_attention: bool | None\n\n# Specify a custom attention implementation, used mostly for kernels.\nattn_implementation: str | None\n\n# Use hybrid attention for Gemma 4: flash_attention_2 for sliding window layers and sdpa\n# for global (full_attention) layers. Global layers have head_dim=512 which exceeds\n# flash attention's supported size.\ngemma4_hybrid_attn_impl: bool | None\n\n# Which experts implementation to use for MoE models,\nexperts_implementation: str | None\n\n# Quantize MoE expert weights on load to reduce VRAM. Requires adapter (lora/qlora) with\n# load_in_4bit or load_in_8bit. Requires CUDA (not compatible with ROCm or other\n# backends). Note: total parameter count may be reported incorrectly when enabled\n# (trainable param count is correct).\nquantize_moe_experts: bool = False\n\n# Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399\nscaling_softmax: bool | None\n# Scaling factor for SSMax attention. Default is 0.43\nscaling_softmax_factor: float | None\n# Bias for SSMax attention. Default is 0.0. 
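
# Example — a sketch of the attention-backend and padding options above (illustrative;
# pick a single backend that matches your hardware and model):
#
flash_attention: true
# sdp_attention: true   # alternative backend if flash-attention is unavailable
pad_to_sequence_len: true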
Note: The paper recommends bias=0 for better\n# length generalization.\nscaling_softmax_bias: float | None\n\nunsloth_cross_entropy_loss: bool | None\nunsloth_lora_mlp: bool | None\nunsloth_lora_qkv: bool | None\nunsloth_lora_o: bool | None\nunsloth_rms_norm: bool | None\nunsloth_rope: bool | None\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_mlp_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_qkv_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_o_kernel: bool | None\n# Apply custom LoRA autograd function for embedding layers. See:\n# https://docs.axolotl.ai/docs/lora_optims.html\nlora_embedding_kernel: bool | None\n\n# Whether to use chunked cross entropy loss for memory efficiency\nchunked_cross_entropy: bool | None\n# Number of chunks to use for chunked cross entropy loss\nchunked_cross_entropy_num_chunks: int | None\n# Enable Entropy-Aware Focal Training loss (EAFT)\nuse_eaft: bool | None\n# Exponent for entropy weighting in EAFT (default: 1.0)\neaft_alpha: float | None = 1.0\n# Number of top logits for entropy approximation (default: 20)\neaft_k: int | None = 20\n\n# Whether to use ALST tiled mlp for memory efficient long context\ntiled_mlp: bool | None\n\n# Number of shards to use for ALST tiled mlp. If unset, it will be set based on\n# seqlen/hidden_size\ntiled_mlp_num_shards: int | None\n\n# Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on\n# llama.\ntiled_mlp_use_original_mlp: bool | None = True\n\nllama4_linearized_experts: bool | None\n\n# Deepspeed config path. 
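
# Example — an illustrative sketch (not a recommendation) enabling the LoRA Triton kernel
# optimizations and chunked cross-entropy loss described above:
#
lora_mlp_kernel: true
lora_qkv_kernel: true
lora_o_kernel: true
chunked_cross_entropy: true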
e.g., deepspeed_configs/zero3.json\ndeepspeed: str | dict[str, Any] | None\n# Whether to use deepcompile for faster training with deepspeed\ndeepcompile: bool | None\n# FSDP configuration\nfsdp: list[str] | None\n\n# FSDP configuration options\nfsdp_config: FSDPConfig | None\n # For FSDPConfig:\n # FSDP version\n fsdp_version: int | None\n # Enable activation checkpointing to reduce memory usage during forward passes\n activation_checkpointing: bool | None\n # Offload parameters to CPU to reduce GPU memory usage\n offload_params: bool | None\n # Synchronize module states across all processes\n sync_module_states: bool | None\n # Enable CPU RAM efficient loading to reduce memory usage during model loading\n cpu_ram_efficient_loading: bool | None\n # Disabling this enables swap memory usage for resource-constrained setups when\n # offload_params is enabled.\n cpu_offload_pin_memory: bool | None\n # Use original parameters instead of flattened parameters\n use_orig_params: bool | None\n\n # Type of state dict to use for saving/loading checkpoints\n state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n # Final state dict type to use after training completion\n final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n # Policy for automatically wrapping modules with FSDP\n auto_wrap_policy: Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None\n # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')\n transformer_layer_cls_to_wrap: str | None\n\n # Reshard parameters after forward pass to save memory\n reshard_after_forward: bool | None\n # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')\n mixed_precision_policy: str | None\n\n# FSDP version\nfsdp_version: int | None\nfsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for\n# no eval.\nval_set_size: float | None = 0.0\n\n# Number of devices to shard across. If not set, will use all available devices.\ndp_shard_size: int | None\n# Number of devices to replicate across.\ndp_replicate_size: int | None\n# Deprecated: use `context_parallel_size` instead\nsequence_parallel_degree: int | None\n# Set to a divisor of the number of GPUs available to split sequences into chunks of\n# equal size. Use in long context training to prevent OOM when sequences cannot fit into\n# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each\n# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized\n# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more\n# details.\ncontext_parallel_size: int | None\n# Optional; strides across the key dimension. Larger values use more memory but should\n# make training faster. Must evenly divide the number of KV heads in your model.\nheads_k_stride: int | None\n# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to\n# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing\n# case.\nring_attn_func: RingAttnFunc | None\n# Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.\ntensor_parallel_size: int | None\n\n# Add or change special tokens. 
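
# Example — a minimal FSDP sketch with illustrative values; the layer class to wrap
# depends on the model architecture ('LlamaDecoderLayer' is taken from the example in the
# schema above):
#
fsdp_version: 2
fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  state_dict_type: FULL_STATE_DICT
  reshard_after_forward: true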
If you add tokens here, you don't need to add them to\n# the `tokens` list.\nspecial_tokens: SpecialTokensConfig | None\n # For SpecialTokensConfig:\n bos_token: str | None\n eos_token: str | None\n pad_token: str | None\n unk_token: str | None\n additional_special_tokens: list[str] | None\n\n# Add extra tokens to the tokenizer\ntokens: list[str] | None\n# Mapping token_id to new_token_string to override reserved added_tokens in the\n# tokenizer. Only works for tokens that are not part of the base vocab (aka are\n# added_tokens). Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides: dict[int, str] | None\n\n# Whether to use torch.compile and which backend to use. setting to `auto` will enable\n# torch compile when torch>=2.6.0\ntorch_compile: Literal['auto'] | bool | None\n# Backend to use for torch.compile\ntorch_compile_backend: str | None\ntorch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None\n\n# Maximum number of iterations to train for. It precedes num_epochs which means that if\n# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =>\n# `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps: int | None\n# Number of warmup steps. Cannot use with warmup_ratio\nwarmup_steps: int | None\n# Warmup ratio. Cannot use with warmup_steps\nwarmup_ratio: float | None\n# Leave empty to eval at each epoch, integer for every N steps. float for fraction of\n# total steps\neval_steps: int | float | None\n# Number of times per epoch to run evals, mutually exclusive with eval_steps\nevals_per_epoch: int | None\n# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer\n# from `eval_steps`\neval_strategy: str | None\n\n# Leave empty to save at each epoch, integer for every N steps. float for fraction of\n# total steps\nsave_steps: int | float | None\n# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsaves_per_epoch: int | None\n# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better\n# result is achieved, leave empty to infer from `save_steps`\nsave_strategy: str | None\n# Checkpoints saved at a time\nsave_total_limit: int | None\n# Whether to checkpoint a model after the first step of training. Defaults to False.\nsave_first_step: bool | None\n\n# Logging frequency\nlogging_steps: int | None\n# Stop training after this many evaluation losses have increased in a row. https://huggi\n# ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin\n# gCallback\nearly_stopping_patience: int | None\nload_best_model_at_end: bool | None = False\n# Save only the model weights, skipping the optimizer. Using this means you can't resume\n# from checkpoints.\nsave_only_model: bool | None = False\n# Use tensorboard for logging\nuse_tensorboard: bool | None\n# Enable the pytorch profiler to capture the first N steps of training to the\n# output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more\n# information. Snapshots can be visualized @ https://pytorch.org/memory_viz\nprofiler_steps: int | None\n# Which step to start the profiler at. Useful for only capturing a few steps mid-run.\nprofiler_steps_start: int | None = 0\n# bool of whether to report tokens per second at the end of training. 
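
# Example — an illustrative scheduling sketch (placeholder values) for the warmup,
# evaluation, and checkpointing options above. Note that warmup_steps/warmup_ratio are
# mutually exclusive, as are eval_steps/evals_per_epoch and save_steps/saves_per_epoch:
#
warmup_ratio: 0.05
evals_per_epoch: 2
saves_per_epoch: 1
save_total_limit: 3
logging_steps: 10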
This is not\n# supported with pre-training datasets.\ninclude_tokens_per_second: bool | None\n# bool of whether to report tokens per second per-gpu during training by measuring\n# throughput of non-padding tokens.\ninclude_tkps: bool | None = True\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to\n# add noise to embeddings. Currently only supported on Llama and Mistral\nneftune_noise_alpha: float | None\n\n# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to\n# `beta` in `ORPOConfig` due to trl mapping.\norpo_alpha: float | None\n# Target reward margin for the SimPO loss\nsimpo_gamma: float | None\n# Weight of the BC regularizer\ncpo_alpha: float | None\n\n# Factor for desirable loss term in KTO loss\nkto_desirable_weight: float | None\n# Factor for undesirable loss term in KTO loss\nkto_undesirable_weight: float | None\n# The beta parameter for the RL training\nrl_beta: float | None\n\n# Defines the max memory usage per gpu on the system. Passed through to transformers\n# when loading the model.\nmax_memory: dict[int | Literal['cpu', 'disk'], int | str] | None\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in\n# gigabytes); default: unset\ngpu_memory_limit: int | str | None\n# Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage: bool | None\n\n# The name of the chat template to use for training, following values are supported:\n# tokenizer_default: Uses the chat template that is available in the\n# tokenizer_config.json. If the chat template is not available in the tokenizer, it will\n# raise an error. This is the default value.\n# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.\n# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not\n# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.\n# The custom jinja template should be provided in the chat_template_jinja field. The\n# selected chat template will be saved to the tokenizer_config.json for easier\n# inferencing\nchat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None\n# Custom jinja template or path to jinja file for chat template. This will be only used\n# if chat_template is set to `jinja` or `null` (in which case chat_template is\n# automatically set to `jinja`). Default is null.\nchat_template_jinja: str | None\n# Additional kwargs to pass to the chat template. This is useful for customizing the\n# chat template. For example, you can pass `thinking=False` to add a generation prompt\n# to the chat template.\nchat_template_kwargs: dict[str, Any] | None\n# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the\n# boundaries between conversation turns. For example: ['/INST', '</s>',\n# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is\n# useful for templates that use multiple delimiter tokens.\neot_tokens: list[str] | None\n# Changes the default system message. 
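
# Example — a sketch of the chat-template options above (illustrative; chatml is just one
# of the supported templates, and '<|im_end|>' is its usual end-of-turn marker):
#
chat_template: chatml
eot_tokens:
  - "<|im_end|>"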
Currently only supports chatml.\ndefault_system_message: str | None\n\n# Token index or indices to adjust embedding weights to the mean of the other tokens.\n# This is useful when the model has untrained embeddings.\nfix_untrained_tokens: int | list[int] | None\n\nis_preprocess: bool | None\npreprocess_iterable: bool | None\n\n# Total number of tokens - internal use\ntotal_num_tokens: int | None\ntotal_supervised_tokens: int | None\n# You can set these packing optimizations AFTER starting a training at least once. The\n# trainer will provide recommended values for these values.\nsample_packing_eff_est: float | None\naxolotl_config_path: str | None\n\n# Internal use only - Used to identify which the model is based on\nis_falcon_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on\nis_llama_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on. Please note that if\n# you set this to true, `padding_side` will be set to 'left' by default\nis_mistral_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on\nis_qwen_derived_model: bool | None\n\n# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available\n# plugins or doc below for more details.\n# https://docs.axolotl.ai/docs/custom_integrations.html\nplugins: list[str] | None\n# Enable sample generation during training for monitoring\ngenerate_samples: bool | None = False\n# Number of samples to generate at each interval\nnum_generation_samples: int | None = 3\n# Maximum new tokens to generate per sample\ngeneration_max_new_tokens: int | None = 50\n# Temperature for sample generation (0.0 = greedy)\ngeneration_temperature: float | None = 0.7\n# Nucleus sampling parameter for generation\ngeneration_top_p: float | None\n# Top-k sampling parameter for generation\ngeneration_top_k: int | None\n# Ratio of input to use as prompt (0.0-1.0)\ngeneration_prompt_ratio: float | None = 0.5\n# Whether to use sampling (vs greedy decoding)\ngeneration_do_sample: bool | None = True\n\n# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This\n# can also be a relative path to a model on disk\nbase_model: str (required)\n# If the base_model repo on hf hub doesn't include configuration .json files, You can\n# set that here, or leave this empty to default to base_model\nbase_model_config: str | None\n# transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to\n# AutoConfig.\ncls_model_config: str | None\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one defined in the base model\ntokenizer_config: str | None\n# use_fast option for tokenizer loading from_pretrained, default to True\ntokenizer_use_fast: bool | None\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy: bool | None\n# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-\n# common tokenizer.\ntokenizer_use_mistral_common: bool | None\n# Corresponding tokenizer for the model AutoTokenizer is a good choice\ntokenizer_type: str | None\n# transformers processor class\nprocessor_type: str | None\n# Whether to save jinja files for tokenizer, transformers default is True\ntokenizer_save_jinja_files: bool | None = True\n# Trust remote code for untrusted source\ntrust_remote_code: bool | None\n\n# Don't move the model to the device before sharding. 
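
# Example — a minimal model/tokenizer sketch. The repository id below is a hypothetical
# placeholder, not a specific recommendation:
#
base_model: your-org/your-base-model   # placeholder repo id
tokenizer_type: AutoTokenizer
trust_remote_code: false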
Set to `false` to revert to legacy\n# behavior.\nexperimental_skip_move_to_device: bool | None = True\n\n# Use custom kernels, e.g. MegaBlocks.\nuse_kernels: bool | None\n\n# Model loading quantization config\nmodel_quantization_config: Literal['Mxfp4Config'] | None\n# kwargs for model quantization config\nmodel_quantization_config_kwargs: dict[str, Any] | None\n\n# Where to save the full-finetuned model to\noutput_dir: str = ./model-out\n# push checkpoints to hub\nhub_model_id: str | None\n# how to push checkpoints to hub\nhub_strategy: str | None\n# branch/revision to push to on hub (default: main)\nhub_revision: str | None\n# Whether to save the model using safetensors format. Defaults to True.\nsave_safetensors: bool | None = True\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: bool | None = False\n# Use bitsandbytes 4 bit\nload_in_4bit: bool | None = False\n\n# If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all\n# parameters in original model\nadapter: Literal['lora', 'qlora', 'llama-adapter'] | None\n# If you already have a lora model trained that you want to load, put that here. This\n# means after training, if you want to test the model, you should set this to the value\n# of `output_dir`. Note that if you merge an adapter to the base model, a new\n# subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir: str | None\nlora_r: int | None\nlora_alpha: int | None\nlora_fan_in_fan_out: bool | None\nlora_target_modules: str | list[str] | None\nlora_target_parameters: str | list[str] | None\n# If true, will target all linear modules\nlora_target_linear: bool | None\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules\n# because they need to know the new tokens. For LLaMA and Mistral, you need to save\n# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts\n# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\nlora_modules_to_save: list[str] | None\nlora_dropout: float | None = 0.0\n# The layer indices to transform, otherwise, apply to all layers\npeft_layers_to_transform: list[int] | None\npeft_layers_pattern: list[str] | None\n\npeft: PeftConfig | None\n # For PeftConfig:\n # Configuration options for loftq initialization for LoRA\n loftq_config: LoftQConfig | None\n # For LoftQConfig:\n # typically 4 bits\n loftq_bits: int = 4\n\n# Whether to use DoRA.\npeft_use_dora: bool | None\n# Whether to use RSLoRA.\npeft_use_rslora: bool | None\n# List of layer indices to replicate.\npeft_layer_replication: list[tuple[int, int]] | None\n# How to initialize LoRA weights. Default to True which is MS original implementation.\npeft_init_lora_weights: bool | str | None\n# A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict\n# mapping an embedding layer name to its trainable token indices. See\n# https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-\n# tokens-alongside-lora\npeft_trainable_token_indices: list[int] | dict[str, list[int]] | None\n# Whether to tie adapter weights for tied model weights. See\n# https://github.com/huggingface/peft/issues/2864\npeft_ensure_weight_tying: bool | None\n# Whether to upcast the LoRA adapter to fp32. 
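
# Example — an illustrative QLoRA sketch built from the adapter options above (rank,
# alpha, and dropout are placeholder values; embed_tokens/lm_head only need saving when
# new tokens were added, per the note above):
#
adapter: qlora
load_in_4bit: true
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true
lora_modules_to_save:
  - embed_tokens
  - lm_head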
This is enabled by default in PEFT.\npeft_autocast_adapter_dtype: bool | None\n\n# load qlora model in sharded format for FSDP using answer.ai technique.\nqlora_sharded_model_loading: bool | None = False\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it\n# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: bool | None\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: bool | None\n# optional overrides to the bnb 4bit quantization configuration\nbnb_config_kwargs: dict[str, Any] | None\n\n# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_ratio: float | None\n# loraplus learning rate for lora embedding layers. Default value is 1e-6.\nloraplus_lr_embedding: float | None = 1e-06\n\nmerge_lora: bool | None\n# Method to use for LoRA merging. 'memory_efficient' (default) processes shards\n# individually to reduce memory usage, 'legacy' loads the full model into memory.\nmerge_method: Literal['legacy', 'memory_efficient'] | None = memory_efficient\n\n# Whether to use ReLoRA. Use with jagged_restart_*steps options.\nrelora: bool | None\n# threshold for optimizer magnitude when pruning\nrelora_prune_ratio: float | None\n# True to perform lora weight merges on cpu during restarts, for modest gpu memory\n# savings\nrelora_cpu_offload: bool | None\n\n# how often to reset for jagged restarts\njagged_restart_steps: int | None\n# how many warmup steps to take after reset for jagged restarts\njagged_restart_warmup_steps: int | None\n# how many anneal steps to take before reset for jagged restarts\njagged_restart_anneal_steps: int | None\n\n# If greater than 1, backpropagation will be skipped and the gradients will be\n# accumulated for the given number of steps.\ngradient_accumulation_steps: int | None = 1\n# The number of samples to include in each batch. This is the number of samples sent to\n# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: int | None = 1\n# Total batch size, we do not recommended setting this manually\nbatch_size: int | None\n# per gpu micro batch size for evals, defaults to value of micro_batch_size\neval_batch_size: int | None\n\n# whether to find batch size that fits in memory. Passed to underlying transformers\n# Trainer\nauto_find_batch_size: bool | None\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: bool | None = False\n# Group similarly sized data to minimize padding. May be slower to start, as it must\n# download and sort the entire dataset. Note that training loss may have an oscillating\n# pattern with this enabled.\ngroup_by_length: bool | None\n\nlearning_rate: str | float (required)\nembedding_lr: float | None\nembedding_lr_scale: float | None\n# Specify weight decay\nweight_decay: float | None = 0.0\n# Specify optimizer\noptimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED\n# Dictionary of arguments to pass to the optimizer\noptim_args: str | dict[str, Any] | None\n# The target modules to optimize, i.e. 
the module names that you would like to train,\n# right now this is used only for GaLore algorithm\noptim_target_modules: list[str] | Literal['all_linear'] | None\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path: str | None\nlr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE\n# Specify a scheduler and kwargs to use with the optimizer\nlr_scheduler_kwargs: dict[str, Any] | None\nlr_quadratic_warmup: bool | None\n# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of\n# peak lr\ncosine_min_lr_ratio: float | None\n# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means\n# start cosine_min_lr at 80% of training step\ncosine_constant_lr_ratio: float | None\n# Learning rate div factor\nlr_div_factor: float | None\n\nlr_groups: list[LrGroup] | None\n # For LrGroup:\n name: str (required)\n modules: list[str] (required)\n lr: float (required)\n\n# adamw hyperparams\nadam_epsilon: float | None\n# only used for CAME Optimizer\nadam_epsilon2: float | None\n# adamw hyperparams\nadam_beta1: float | None\n# adamw hyperparams\nadam_beta2: float | None\n# only used for CAME Optimizer\nadam_beta3: float | None\n\n# Dion Optimizer learning rate\ndion_lr: float | None\n# Dion Optimizer momentum\ndion_momentum: float | None\n# Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank\n# dimension.\ndion_rank_fraction: float | None = 1.0\n# Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may\n# be useful to ensure even sharding.\ndion_rank_multiple_of: int | None = 1\n\n# Gradient clipping max norm\nmax_grad_norm: float | None\nnum_epochs: float = 1.0\n\nuse_wandb: bool | None\n# Set the name of your wandb run\nwandb_name: str | None\n# Set the ID of your wandb run\nwandb_run_id: str | None\n# \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn\n# off wandb\nwandb_mode: str | None\n# Your wandb project name\nwandb_project: str | None\n# A wandb Team name if using a Team\nwandb_entity: str | None\nwandb_watch: str | None\n# \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only\n# at the end of training\nwandb_log_model: str | None\n\nuse_mlflow: bool | None\n# URI to mlflow\nmlflow_tracking_uri: str | None\n# Your experiment name\nmlflow_experiment_name: str | None\n# Your run name\nmlflow_run_name: str | None\n# set to true to copy each saved checkpoint on each save to mlflow artifact registry\nhf_mlflow_log_artifacts: bool | None\n\n# Enable or disable Comet integration.\nuse_comet: bool | None\n# API key for Comet. Recommended to set via `comet login`.\ncomet_api_key: str | None\n# Workspace name in Comet. Defaults to the user's default workspace.\ncomet_workspace: str | None\n# Project name in Comet. Defaults to Uncategorized.\ncomet_project_name: str | None\n# Identifier for the experiment. Used to append data to an existing experiment or\n# control the key of new experiments. Default to a random key.\ncomet_experiment_key: str | None\n# Create a new experiment (\"create\") or log to an existing one (\"get\"). Default\n# (\"get_or_create\") auto-selects based on configuration.\ncomet_mode: str | None\n# Set to True to log data to Comet server, or False for offline storage. 
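
# Example — a training-hyperparameter and experiment-tracking sketch using illustrative
# values (the wandb project and run names are placeholders):
#
micro_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 2e-5
lr_scheduler: cosine
num_epochs: 3
max_grad_norm: 1.0
use_wandb: true
wandb_project: my-project   # placeholder
wandb_name: qlora-run-1     # placeholder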
Default is\n# True.\ncomet_online: bool | None\n# Dictionary for additional configuration settings, see the doc for more details.\ncomet_experiment_config: dict[str, Any] | None\n\nuse_trackio: bool | None\n# Your trackio project name\ntrackio_project_name: str | None\n# Set the name of your trackio run\ntrackio_run_name: str | None\n# Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)\ntrackio_space_id: str | None\n\n# Enable OpenTelemetry metrics collection and Prometheus export\nuse_otel_metrics: bool | None = False\n# Host to bind the OpenTelemetry metrics server to\notel_metrics_host: str | None = localhost\n# Port for the Prometheus metrics HTTP server\notel_metrics_port: int | None = 8000\n\n# the number of activate layers in LISA\nlisa_n_layers: int | None\n# how often to switch layers in LISA\nlisa_step_interval: int | None\n# path under the model to access the layers\nlisa_layers_attribute: str | None = model.layers\n\ngradio_title: str | None\ngradio_share: bool | None\ngradio_server_name: str | None\ngradio_server_port: int | None\ngradio_max_new_tokens: int | None\ngradio_temperature: float | None\n\nuse_ray: bool = False\nray_run_name: str | None\nray_num_workers: int = 1\nresources_per_worker: dict\n\n# The size of the image to resize to. It can be an integer (resized into padded-square\n# image) or a tuple (width, height).If not provided, we will attempt to load from\n# preprocessor.size, otherwise, images won't be resized.\nimage_size: int | tuple[int, int] | None\n# The resampling algorithm to use for image resizing. Default is bilinear. Please refer\n# to PIL.Image.Resampling for more details.\nimage_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None\n\n# optional overrides to the base model configuration\noverrides_of_model_config: dict[str, Any] | None\n# optional overrides the base model loading from_pretrained\noverrides_of_model_kwargs: dict[str, Any] | None\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good\n# choice too\ntype_of_model: str | None\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model: str | None\n\nmax_packed_sequence_len: int | None\nrope_scaling: Any | None\nnoisy_embedding_alpha: float | None\ndpo_beta: float | None\nevaluation_strategy: str | None\neval_table_size: int | None\neval_max_new_tokens: int | None\ndpo_use_logits_to_keep: bool | None\ndpo_generate_during_eval: bool | None\ndpo_norm_loss: bool | None\nrpo_alpha: float | None",
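
# Example — an illustrative multimodal image-preprocessing sketch (placeholder values;
# per the docs above, image_size may be a single integer or a [width, height] pair):
#
image_size: [672, 672]
image_resize_algorithm: bicubic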
"crumbs": [
"Getting Started",
"Config Reference"
diff --git a/sitemap.xml b/sitemap.xml
index 214a8b600..e703f3c5d 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -2,990 +2,990 @@
[sitemap.xml hunk elided: every <lastmod> timestamp was bumped from 2026-04-10 to 2026-04-12; no URLs appear to be added or removed.]