diff --git a/.nojekyll b/.nojekyll
index 35b5d1540..2aeeeeec3 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-b8caf314
\ No newline at end of file
+0c5f4db8
\ No newline at end of file
diff --git a/docs/api/loaders.tokenizer.html b/docs/api/loaders.tokenizer.html
index 5331d7f70..8e88d0a91 100644
--- a/docs/api/loaders.tokenizer.html
+++ b/docs/api/loaders.tokenizer.html
@@ -800,7 +800,8 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
tokenizer_path,
token_mappings,
output_dir,
-)
+    revision='main',
+)
Modify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.
This only works with reserved tokens that were added to the tokenizer, not tokens
@@ -809,10 +810,10 @@ already part of the vocab.
| Directory to save the modified tokenizer | required | ||
+| revision | str | Model revision/branch/tag/commit to load from (HF Hub) | 'main' |
+
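For orientation, the override that drives this utility is configured in the training YAML; a minimal sketch might look like the following (the token id and replacement string are placeholders, and how the new revision argument is wired up to the model revision being loaded is an assumption, not something this diff states):

added_tokens_overrides:
  128011: "<|custom_tool_call|>"   # hypothetical reserved token id and replacement string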
utils.trainer.drop_long_seq(
+
+filter_sequences_by_length
+utils.trainer.filter_sequences_by_length(
sample,
sequence_len=2048,
min_sequence_len=2,
raise_on_drop=False,
)
-Drop samples whose sequence length is either too long (> sequence_len)
-or too short (< min_sequence_len).
+Filter sequences outside valid length range [min_sequence_len, sequence_len].
+Drops samples that are either too short (< min_sequence_len) or too long (> sequence_len).
Works for both single-example (list[int]) and batched (list[list[int]]) inputs.
If raise_on_drop is set, the code raises a ValueError if a sample is
encountered that is too long and would have been dropped.
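As a rough illustration, the config keys that feed this filtering might be set like so (placeholder values; the assumption here is that min_sample_len maps to min_sequence_len):

sequence_len: 4096            # samples longer than this are dropped, truncated, or raise, per excess_length_strategy
min_sample_len: 2             # samples shorter than this are dropped
excess_length_strategy: drop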
diff --git a/docs/config-reference.html b/docs/config-reference.html
index 06cbb04aa..c9732ef6f 100644
--- a/docs/config-reference.html
+++ b/docs/config-reference.html
@@ -1465,767 +1465,780 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
ddp_broadcast_buffers: bool | None
ddp_find_unused_parameters: bool | None
-# Approximate number of predictions sent to wandb depending on batch size. Enabled above
-# 0. Default is 0
-eval_table_size: int | None
-# Total number of tokens generated for predictions sent to wandb. Default is 128
-eval_max_new_tokens: int | None
-# Whether to run causal language model evaluation for metrics in
-# `eval_causal_lm_metrics`
-do_causal_lm_eval: bool | None
-# HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter',
-# 'chrf', 'perplexity']
-eval_causal_lm_metrics: list[str] | None
-do_bench_eval: bool | None
-bench_dataset: str | None
-bench_split: str | None
-metric_for_best_model: str | None
-greater_is_better: bool | None
-
-# High loss value, indicating the learning has broken down (a good estimate is ~2 times
-# the loss at the start of training)
-loss_watchdog_threshold: float | None
-# Number of high-loss steps in a row before the trainer aborts (default: 3)
-loss_watchdog_patience: int | None
-
-# Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before
-# evaluations. Default is 0 (disabled).
-gc_steps: int | None
-
-# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.
-# require >=ampere
-bf16: Literal['auto'] | bool | None = auto
-# Use CUDA fp16
-fp16: bool | None
-# Enable FP8 mixed precision training using TorchAO. Best used in combination with
-# torch.compile.
-fp8: bool | None
-# Enable FSDP float8 all-gather optimization for FP8 training. Can improve training
-# speed by 10-15% when FSDP is enabled.
-fp8_enable_fsdp_float8_all_gather: bool | None
-# No AMP (automatic mixed precision) - require >=ampere
-bfloat16: bool | None
-# No AMP (automatic mixed precision)
-float16: bool | None
-# Use CUDA tf32 - require >=ampere
-tf32: bool | None
-float32: bool | None
-
-# Whether to use gradient checkpointing. Available options are: true, false, 'offload',
-# 'offload_disk'.
-# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
-gradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False
-# Additional kwargs to pass to the trainer for gradient checkpointing
-gradient_checkpointing_kwargs: dict[str, Any] | None
-# Whether to offload activations. Available options are: true, false, 'legacy', 'disk'.
-activation_offloading: Literal['legacy', 'disk'] | bool | None = False
-
-# List of regex patterns for parameter names to keep unfrozen. All other parameters will
-# be frozen via requires_grad=False. Note: range-based patterns (e.g.
-# embed_tokens.weight$[:32000]) use gradient zeroing rather than a true freeze, so
-# weight decay will still apply to the frozen portion and optimizer states are allocated
-# for the full parameter.
-unfrozen_parameters: list[str] | None
-
-# The maximum length of an input to train with, this should typically be less than 2048
-# as most models have a token/context limit of 2048
-sequence_len: int = 512
-# What to do when a tokenized row exceeds sequence_len. 'drop' removes the row;
-# 'truncate' slices tensors to sequence_len; 'raise' raises a ValueError. Defaults to
-# 'drop' for backward compatibility.
-excess_length_strategy: Literal['drop', 'truncate', 'raise'] | None
-# The maximum length of an input for evaluation. If not specified, defaults to
-# sequence_len
-eval_sequence_len: int | None
-min_sample_len: int | None
-# maximum prompt length for RL training
-max_prompt_len: int | None
-# Use efficient multi-packing with block diagonal attention and per sequence
-# position_ids. Recommend set to 'true'
-sample_packing: bool | None
-# The number of samples packed at a time. Increasing the following values helps with
-# packing, but usually only slightly (<%1.)
-sample_packing_group_size: int | None = 100000
-# The number of samples which can be packed into one sequence. Increase if using a large
-# sequence_len with many short samples.
-sample_packing_bin_size: int | None = 200
-# Whether to pack samples sequentially
-sample_packing_sequentially: bool | None
-# The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or
-# 'forkserver'
-sample_packing_mp_start_method: str | None
-# Set to 'false' if getting errors during eval with sample_packing on
-eval_sample_packing: bool | None
-# Pad inputs so each step uses constant sized buffers. This will reduce memory
-# fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to
-# True if `sample_packing` enabled
-pad_to_sequence_len: bool | None
-# Whether to use sequential sampling for curriculum learning
-curriculum_sampling: bool | None
-multipack_real_batches: bool | None
-
-# Use batch flattening for speedups when not using sample_packing
-batch_flattening: Literal['auto'] | bool | None
+# Whether to run causal language model evaluation for metrics in
+# `eval_causal_lm_metrics`
+do_causal_lm_eval: bool | None
+# HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter',
+# 'chrf', 'perplexity']
+eval_causal_lm_metrics: list[str] | None
+do_bench_eval: bool | None
+bench_dataset: str | None
+bench_split: str | None
+metric_for_best_model: str | None
+greater_is_better: bool | None
+
+# High loss value, indicating the learning has broken down (a good estimate is ~2 times
+# the loss at the start of training)
+loss_watchdog_threshold: float | None
+# Number of high-loss steps in a row before the trainer aborts (default: 3)
+loss_watchdog_patience: int | None
+
+# Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before
+# evaluations. Default is 0 (disabled).
+gc_steps: int | None
+
+# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.
+# requires >= Ampere
+bf16: Literal['auto'] | bool | None = auto
+# Use CUDA fp16
+fp16: bool | None
+# Enable FP8 mixed precision training using TorchAO. Best used in combination with
+# torch.compile.
+fp8: bool | None
+# Enable FSDP float8 all-gather optimization for FP8 training. Can improve training
+# speed by 10-15% when FSDP is enabled.
+fp8_enable_fsdp_float8_all_gather: bool | None
+# No AMP (automatic mixed precision) - requires >= Ampere
+bfloat16: bool | None
+# No AMP (automatic mixed precision)
+float16: bool | None
+# Use CUDA tf32 - requires >= Ampere
+tf32: bool | None
+float32: bool | None
+
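For illustration, a typical mixed-precision block on Ampere-or-newer GPUs might look like this (placeholder values, not recommendations):

bf16: auto    # 'auto' detects bf16 support automatically; use 'full' for bf16_full_eval
tf32: true    # requires >= Ampere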
+# Whether to use gradient checkpointing. Available options are: true, false, 'offload',
+# 'offload_disk'.
+# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
+gradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False
+# Additional kwargs to pass to the trainer for gradient checkpointing
+gradient_checkpointing_kwargs: dict[str, Any] | None
+# Whether to offload activations. Available options are: true, false, 'legacy', 'disk'.
+activation_offloading: Literal['legacy', 'disk'] | bool | None = False
+
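A minimal sketch of these checkpointing options (illustrative only; use_reentrant is a standard transformers gradient checkpointing kwarg, shown here as an example):

gradient_checkpointing: true          # or 'offload' / 'offload_disk' to trade speed for memory
gradient_checkpointing_kwargs:
  use_reentrant: false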
+# List of regex patterns for parameter names to keep unfrozen. All other parameters will
+# be frozen via requires_grad=False. Note: range-based patterns (e.g.
+# embed_tokens.weight$[:32000]) use gradient zeroing rather than a true freeze, so
+# weight decay will still apply to the frozen portion and optimizer states are allocated
+# for the full parameter.
+unfrozen_parameters: list[str] | None
+
+# The maximum length of an input to train with, this should typically be less than 2048
+# as most models have a token/context limit of 2048
+sequence_len: int = 512
+# What to do when a tokenized row exceeds sequence_len. 'drop' removes the row;
+# 'truncate' slices tensors to sequence_len; 'raise' raises a ValueError. Defaults to
+# 'drop' for backward compatibility.
+excess_length_strategy: Literal['drop', 'truncate', 'raise'] | None
+# The maximum length of an input for evaluation. If not specified, defaults to
+# sequence_len
+eval_sequence_len: int | None
+min_sample_len: int | None
+# maximum prompt length for RL training
+max_prompt_len: int | None
+# Use efficient multi-packing with block diagonal attention and per sequence
+# position_ids. Recommended to set this to 'true'
+sample_packing: bool | None
+# The number of samples packed at a time. Increasing the following values helps with
+# packing, but usually only slightly (<1%).
+sample_packing_group_size: int | None = 100000
+# The number of samples which can be packed into one sequence. Increase if using a large
+# sequence_len with many short samples.
+sample_packing_bin_size: int | None = 200
+# Whether to pack samples sequentially
+sample_packing_sequentially: bool | None
+# The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or
+# 'forkserver'
+sample_packing_mp_start_method: str | None
+# Set to 'false' if getting errors during eval with sample_packing on
+eval_sample_packing: bool | None
+# Pad inputs so each step uses constant sized buffers. This will reduce memory
+# fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to
+# True if `sample_packing` enabled
+pad_to_sequence_len: bool | None
+# Whether to use sequential sampling for curriculum learning
+curriculum_sampling: bool | None
+multipack_real_batches: bool | None
+
+# Use batch flattening for speedups when not using sample_packing
+batch_flattening: Literal['auto'] | bool | None
+
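Put together, a common packing setup might look like the following sketch (placeholder values):

sequence_len: 4096
sample_packing: true
eval_sample_packing: false   # set to false if eval errors out with packing enabled
pad_to_sequence_len: true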
+use_pose: bool | None
+pose_split_on_token_ids: list[int] | None
+pose_max_context_len: int | None
+pose_num_chunks: int | None
-use_pose: bool | None
-pose_split_on_token_ids: list[int] | None
-pose_max_context_len: int | None
-pose_num_chunks: int | None
-
-pretrain_multipack_buffer_size: int | None
-# whether to prevent cross attention for packed sequences during pretraining
-pretrain_multipack_attn: bool | None = True
-# whether to concatenate samples during pretraining
-pretraining_sample_concatenation: bool | None
+pretrain_multipack_buffer_size: int | None
+# whether to prevent cross attention for packed sequences during pretraining
+pretrain_multipack_attn: bool | None = True
+# whether to concatenate samples during pretraining
+pretraining_sample_concatenation: bool | None
+
+# Use streaming mode for loading datasets
+streaming: bool | None
+# Buffer size for multipack streaming datasets
+streaming_multipack_buffer_size: int | None = 10000
-# Use streaming mode for loading datasets
-streaming: bool | None
-# Buffer size for multipack streaming datasets
-streaming_multipack_buffer_size: int | None = 10000
-
-# Whether to use xformers attention patch https://github.com/facebookresearch/xformers
-xformers_attention: bool | None
-# Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/
-# torch.nn.functional.scaled_dot_product_attention.html
-sdp_attention: bool | None
-# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
-s2_attention: bool | None
-flex_attention: bool | None
-flex_attn_compile_kwargs: dict[str, Any] | None
-# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention
-flash_attention: bool | None
-# Whether to use flash-attention cross entropy implementation - advanced use only
-flash_attn_cross_entropy: bool | None
-# Whether to use flash-attention rms norm implementation - advanced use only
-flash_attn_rms_norm: bool | None
-# Whether to fuse part of the MLP into a single operation
-flash_attn_fuse_mlp: bool | None
-# Whether to use bettertransformers
-flash_optimum: bool | None
-# Whether to use SageAttention https://github.com/thu-ml/SageAttention
-sage_attention: bool | None
+# Whether to use xformers attention patch https://github.com/facebookresearch/xformers
+xformers_attention: bool | None
+# Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/
+# torch.nn.functional.scaled_dot_product_attention.html
+sdp_attention: bool | None
+# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
+s2_attention: bool | None
+flex_attention: bool | None
+flex_attn_compile_kwargs: dict[str, Any] | None
+# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention
+flash_attention: bool | None
+# Whether to use flash-attention cross entropy implementation - advanced use only
+flash_attn_cross_entropy: bool | None
+# Whether to use flash-attention rms norm implementation - advanced use only
+flash_attn_rms_norm: bool | None
+# Whether to fuse part of the MLP into a single operation
+flash_attn_fuse_mlp: bool | None
+# Whether to use bettertransformers
+flash_optimum: bool | None
+# Whether to use SageAttention https://github.com/thu-ml/SageAttention
+sage_attention: bool | None
+
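As an illustrative example, only one attention backend is normally enabled at a time, e.g.:

flash_attention: true
# sdp_attention: true      # alternative backend
# flex_attention: true     # alternative backend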
+eager_attention: bool | None
+
+# Specify a custom attention implementation, used mostly for kernels.
+attn_implementation: str | None
-eager_attention: bool | None
-
-# Specify a custom attention implementation, used mostly for kernels.
-attn_implementation: str | None
-
-# Which experts implementation to use for MoE models,
-experts_implementation: str | None
-
-# Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399
-scaling_softmax: bool | None
-# Scaling factor for SSMax attention. Default is 0.43
-scaling_softmax_factor: float | None
-# Bias for SSMax attention. Default is 0.0. Note: The paper recommends bias=0 for better
-# length generalization.
-scaling_softmax_bias: float | None
-
-unsloth_cross_entropy_loss: bool | None
-unsloth_lora_mlp: bool | None
-unsloth_lora_qkv: bool | None
-unsloth_lora_o: bool | None
-unsloth_rms_norm: bool | None
-unsloth_rope: bool | None
-
-# Apply custom LoRA autograd functions and activation function Triton kernels for speed
-# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
-lora_mlp_kernel: bool | None
-# Apply custom LoRA autograd functions and activation function Triton kernels for speed
-# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
-lora_qkv_kernel: bool | None
-# Apply custom LoRA autograd functions and activation function Triton kernels for speed
-# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
-lora_o_kernel: bool | None
-
-# Whether to use chunked cross entropy loss for memory efficiency
-chunked_cross_entropy: bool | None
-# Number of chunks to use for chunked cross entropy loss
-chunked_cross_entropy_num_chunks: int | None
-# Enable Entropy-Aware Focal Training loss (EAFT)
-use_eaft: bool | None
-# Exponent for entropy weighting in EAFT (default: 1.0)
-eaft_alpha: float | None = 1.0
-# Number of top logits for entropy approximation (default: 20)
-eaft_k: int | None = 20
-
-# Whether to use ALST tiled mlp for memory efficient long context
-tiled_mlp: bool | None
-
-# Number of shards to use for ALST tiled mlp. If unset, it will be set based on
-# seqlen/hidden_size
-tiled_mlp_num_shards: int | None
-
-# Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on
-# llama.
-tiled_mlp_use_original_mlp: bool | None = True
-
-llama4_linearized_experts: bool | None
-
-# Deepspeed config path. e.g., deepspeed_configs/zero3.json
-deepspeed: str | dict[str, Any] | None
-# Whether to use deepcompile for faster training with deepspeed
-deepcompile: bool | None
-# FSDP configuration
-fsdp: list[str] | None
-
-# FSDP configuration options
-fsdp_config: FSDPConfig | None
- # For FSDPConfig:
- # FSDP version
- fsdp_version: int | None
- # Enable activation checkpointing to reduce memory usage during forward passes
- activation_checkpointing: bool | None
- # Offload parameters to CPU to reduce GPU memory usage
- offload_params: bool | None
- # Synchronize module states across all processes
- sync_module_states: bool | None
- # Enable CPU RAM efficient loading to reduce memory usage during model loading
- cpu_ram_efficient_loading: bool | None
- # Disabling this enables swap memory usage for resource-constrained setups when
- # offload_params is enabled.
- cpu_offload_pin_memory: bool | None
- # Use original parameters instead of flattened parameters
- use_orig_params: bool | None
+# Which experts implementation to use for MoE models,
+experts_implementation: str | None
+
+# Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399
+scaling_softmax: bool | None
+# Scaling factor for SSMax attention. Default is 0.43
+scaling_softmax_factor: float | None
+# Bias for SSMax attention. Default is 0.0. Note: The paper recommends bias=0 for better
+# length generalization.
+scaling_softmax_bias: float | None
+
+unsloth_cross_entropy_loss: bool | None
+unsloth_lora_mlp: bool | None
+unsloth_lora_qkv: bool | None
+unsloth_lora_o: bool | None
+unsloth_rms_norm: bool | None
+unsloth_rope: bool | None
+
+# Apply custom LoRA autograd functions and activation function Triton kernels for speed
+# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
+lora_mlp_kernel: bool | None
+# Apply custom LoRA autograd functions and activation function Triton kernels for speed
+# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
+lora_qkv_kernel: bool | None
+# Apply custom LoRA autograd functions and activation function Triton kernels for speed
+# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
+lora_o_kernel: bool | None
+
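An illustrative way to enable all three LoRA kernel optimizations together (assuming an adapter-based run):

lora_mlp_kernel: true
lora_qkv_kernel: true
lora_o_kernel: true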
+# Whether to use chunked cross entropy loss for memory efficiency
+chunked_cross_entropy: bool | None
+# Number of chunks to use for chunked cross entropy loss
+chunked_cross_entropy_num_chunks: int | None
+# Enable Entropy-Aware Focal Training loss (EAFT)
+use_eaft: bool | None
+# Exponent for entropy weighting in EAFT (default: 1.0)
+eaft_alpha: float | None = 1.0
+# Number of top logits for entropy approximation (default: 20)
+eaft_k: int | None = 20
+
+# Whether to use ALST tiled mlp for memory efficient long context
+tiled_mlp: bool | None
+
+# Number of shards to use for ALST tiled mlp. If unset, it will be set based on
+# seqlen/hidden_size
+tiled_mlp_num_shards: int | None
+
+# Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on
+# llama.
+tiled_mlp_use_original_mlp: bool | None = True
+
+llama4_linearized_experts: bool | None
+
+# Deepspeed config path. e.g., deepspeed_configs/zero3.json
+deepspeed: str | dict[str, Any] | None
+# Whether to use deepcompile for faster training with deepspeed
+deepcompile: bool | None
+# FSDP configuration
+fsdp: list[str] | None
+
+# FSDP configuration options
+fsdp_config: FSDPConfig | None
+ # For FSDPConfig:
+ # FSDP version
+ fsdp_version: int | None
+ # Enable activation checkpointing to reduce memory usage during forward passes
+ activation_checkpointing: bool | None
+ # Offload parameters to CPU to reduce GPU memory usage
+ offload_params: bool | None
+ # Synchronize module states across all processes
+ sync_module_states: bool | None
+ # Enable CPU RAM efficient loading to reduce memory usage during model loading
+ cpu_ram_efficient_loading: bool | None
+ # Disabling this enables swap memory usage for resource-constrained setups when
+ # offload_params is enabled.
+ cpu_offload_pin_memory: bool | None
+ # Use original parameters instead of flattened parameters
+ use_orig_params: bool | None
+
+ # Type of state dict to use for saving/loading checkpoints
+ state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
+ # Final state dict type to use after training completion
+ final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
- # Type of state dict to use for saving/loading checkpoints
- state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
- # Final state dict type to use after training completion
- final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
+ # Policy for automatically wrapping modules with FSDP
+ auto_wrap_policy: Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None
+ # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')
+ transformer_layer_cls_to_wrap: str | None
- # Policy for automatically wrapping modules with FSDP
- auto_wrap_policy: Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None
- # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')
- transformer_layer_cls_to_wrap: str | None
+ # Reshard parameters after forward pass to save memory
+ reshard_after_forward: bool | None
+ # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')
+ mixed_precision_policy: str | None
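A minimal FSDP sketch using the options above (placeholder values; LlamaDecoderLayer is just the example class name from the description):

fsdp_config:
  fsdp_version: 2
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  cpu_ram_efficient_loading: true
  reshard_after_forward: true
  state_dict_type: FULL_STATE_DICT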
- # Reshard parameters after forward pass to save memory
- reshard_after_forward: bool | None
- # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')
- mixed_precision_policy: str | None
-
-# FSDP version
-fsdp_version: int | None
-fsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
-
-# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for
-# no eval.
-val_set_size: float | None = 0.0
-
-# Number of devices to shard across. If not set, will use all available devices.
-dp_shard_size: int | None
-# Number of devices to replicate across.
-dp_replicate_size: int | None
-# Deprecated: use `context_parallel_size` instead
-sequence_parallel_degree: int | None
-# Set to a divisor of the number of GPUs available to split sequences into chunks of
-# equal size. Use in long context training to prevent OOM when sequences cannot fit into
-# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each
-# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized
-# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more
-# details.
-context_parallel_size: int | None
-# Optional; strides across the key dimension. Larger values use more memory but should
-# make training faster. Must evenly divide the number of KV heads in your model.
-heads_k_stride: int | None
-# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to
-# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing
-# case.
-ring_attn_func: RingAttnFunc | None
-# Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.
-tensor_parallel_size: int | None
-
-# Add or change special tokens. If you add tokens here, you don't need to add them to
-# the `tokens` list.
-special_tokens: SpecialTokensConfig | None
- # For SpecialTokensConfig:
- bos_token: str | None
- eos_token: str | None
- pad_token: str | None
- unk_token: str | None
- additional_special_tokens: list[str] | None
-
-# Add extra tokens to the tokenizer
-tokens: list[str] | None
-# Mapping token_id to new_token_string to override reserved added_tokens in the
-# tokenizer. Only works for tokens that are not part of the base vocab (aka are
-# added_tokens). Can be checked if they exist in tokenizer.json added_tokens.
-added_tokens_overrides: dict[int, str] | None
-
-# Whether to use torch.compile and which backend to use. setting to `auto` will enable
-# torch compile when torch>=2.6.0
-torch_compile: Literal['auto'] | bool | None
-# Backend to use for torch.compile
-torch_compile_backend: str | None
-torch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None
-
-# Maximum number of iterations to train for. It precedes num_epochs which means that if
-# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =>
-# `num_epochs: 2` and `max_steps: 100` will train for 100 steps
-max_steps: int | None
-# Number of warmup steps. Cannot use with warmup_ratio
-warmup_steps: int | None
-# Warmup ratio. Cannot use with warmup_steps
-warmup_ratio: float | None
-# Leave empty to eval at each epoch, integer for every N steps. float for fraction of
-# total steps
-eval_steps: int | float | None
-# Number of times per epoch to run evals, mutually exclusive with eval_steps
-evals_per_epoch: int | None
-# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer
-# from `eval_steps`
-eval_strategy: str | None
-
-# Leave empty to save at each epoch, integer for every N steps. float for fraction of
-# total steps
-save_steps: int | float | None
-# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps
-saves_per_epoch: int | None
-# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better
-# result is achieved, leave empty to infer from `save_steps`
-save_strategy: str | None
-# Checkpoints saved at a time
-save_total_limit: int | None
-# Whether to checkpoint a model after the first step of training. Defaults to False.
-save_first_step: bool | None
-
-# Logging frequency
-logging_steps: int | None
-# Stop training after this many evaluation losses have increased in a row. https://huggi
-# ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin
-# gCallback
-early_stopping_patience: int | None
-load_best_model_at_end: bool | None = False
-# Save only the model weights, skipping the optimizer. Using this means you can't resume
-# from checkpoints.
-save_only_model: bool | None = False
-# Use tensorboard for logging
-use_tensorboard: bool | None
-# Enable the pytorch profiler to capture the first N steps of training to the
-# output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more
-# information. Snapshots can be visualized @ https://pytorch.org/memory_viz
-profiler_steps: int | None
-# Which step to start the profiler at. Useful for only capturing a few steps mid-run.
-profiler_steps_start: int | None = 0
-# bool of whether to report tokens per second at the end of training. This is not
-# supported with pre-training datasets.
-include_tokens_per_second: bool | None
-# bool of whether to report tokens per second per-gpu during training by measuring
-# throughput of non-padding tokens.
-include_tkps: bool | None = True
-# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to
-# add noise to embeddings. Currently only supported on Llama and Mistral
-neftune_noise_alpha: float | None
-
-# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to
-# `beta` in `ORPOConfig` due to trl mapping.
-orpo_alpha: float | None
-# Weighting of NLL term in loss from RPO paper
-rpo_alpha: float | None
-# Target reward margin for the SimPO loss
-simpo_gamma: float | None
-# Weight of the BC regularizer
-cpo_alpha: float | None
-
-# Factor for desirable loss term in KTO loss
-kto_desirable_weight: float | None
-# Factor for undesirable loss term in KTO loss
-kto_undesirable_weight: float | None
-# The beta parameter for the RL training
-rl_beta: float | None
-
-# Defines the max memory usage per gpu on the system. Passed through to transformers
-# when loading the model.
-max_memory: dict[int | Literal['cpu', 'disk'], int | str] | None
-# Limit the memory for all available GPUs to this amount (if an integer, expressed in
-# gigabytes); default: unset
-gpu_memory_limit: int | str | None
-# Whether to use low_cpu_mem_usage
-low_cpu_mem_usage: bool | None
-
-# The name of the chat template to use for training, following values are supported:
-# tokenizer_default: Uses the chat template that is available in the
-# tokenizer_config.json. If the chat template is not available in the tokenizer, it will
-# raise an error. This is the default value.
-# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates
-# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.
-# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.
-# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not
-# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.
-# The custom jinja template should be provided in the chat_template_jinja field. The
-# selected chat template will be saved to the tokenizer_config.json for easier
-# inferencing
-chat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None
-# Custom jinja template or path to jinja file for chat template. This will be only used
-# if chat_template is set to `jinja` or `null` (in which case chat_template is
-# automatically set to `jinja`). Default is null.
-chat_template_jinja: str | None
-# Additional kwargs to pass to the chat template. This is useful for customizing the
-# chat template. For example, you can pass `thinking=False` to add a generation prompt
-# to the chat template.
-chat_template_kwargs: dict[str, Any] | None
-# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the
-# boundaries between conversation turns. For example: ['/INST', '</s>',
-# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is
-# useful for templates that use multiple delimiter tokens.
-eot_tokens: list[str] | None
-# Changes the default system message. Currently only supports chatml.
-default_system_message: str | None
-
-# Token index or indices to adjust embedding weights to the mean of the other tokens.
-# This is useful when the model has untrained embeddings.
-fix_untrained_tokens: int | list[int] | None
-
-is_preprocess: bool | None
-preprocess_iterable: bool | None
-
-# Total number of tokens - internal use
-total_num_tokens: int | None
-total_supervised_tokens: int | None
-# You can set these packing optimizations AFTER starting a training at least once. The
-# trainer will provide recommended values for these values.
-sample_packing_eff_est: float | None
-axolotl_config_path: str | None
-
-# Internal use only - Used to identify which the model is based on
-is_falcon_derived_model: bool | None
+# FSDP version
+fsdp_version: int | None
+fsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
+
+# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for
+# no eval.
+val_set_size: float | None = 0.0
+
+# Number of devices to shard across. If not set, will use all available devices.
+dp_shard_size: int | None
+# Number of devices to replicate across.
+dp_replicate_size: int | None
+# Deprecated: use `context_parallel_size` instead
+sequence_parallel_degree: int | None
+# Set to a divisor of the number of GPUs available to split sequences into chunks of
+# equal size. Use in long context training to prevent OOM when sequences cannot fit into
+# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each
+# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized
+# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more
+# details.
+context_parallel_size: int | None
+# Optional; strides across the key dimension. Larger values use more memory but should
+# make training faster. Must evenly divide the number of KV heads in your model.
+heads_k_stride: int | None
+# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to
+# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing
+# case.
+ring_attn_func: RingAttnFunc | None
+# Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.
+tensor_parallel_size: int | None
+
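For instance, an 8-GPU long-context run might combine sharding and context parallelism like this (an illustrative split, not a recommendation):

dp_shard_size: 4
context_parallel_size: 2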
+# Add or change special tokens. If you add tokens here, you don't need to add them to
+# the `tokens` list.
+special_tokens: SpecialTokensConfig | None
+ # For SpecialTokensConfig:
+ bos_token: str | None
+ eos_token: str | None
+ pad_token: str | None
+ unk_token: str | None
+ additional_special_tokens: list[str] | None
+
+# Add extra tokens to the tokenizer
+tokens: list[str] | None
+# Mapping token_id to new_token_string to override reserved added_tokens in the
+# tokenizer. Only works for tokens that are not part of the base vocab (aka are
+# added_tokens). Can be checked if they exist in tokenizer.json added_tokens.
+added_tokens_overrides: dict[int, str] | None
+
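A short sketch of adding tokens (placeholder token strings):

special_tokens:
  pad_token: "<|end_of_text|>"
tokens:
  - "<|tool_call|>"
  - "<|tool_response|>"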
+# Whether to use torch.compile and which backend to use. setting to `auto` will enable
+# torch compile when torch>=2.6.0
+torch_compile: Literal['auto'] | bool | None
+# Backend to use for torch.compile
+torch_compile_backend: str | None
+torch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None
+
+# Maximum number of iterations to train for. It precedes num_epochs which means that if
+# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =>
+# `num_epochs: 2` and `max_steps: 100` will train for 100 steps
+max_steps: int | None
+# Number of warmup steps. Cannot use with warmup_ratio
+warmup_steps: int | None
+# Warmup ratio. Cannot use with warmup_steps
+warmup_ratio: float | None
+# Leave empty to eval at each epoch, integer for every N steps. float for fraction of
+# total steps
+eval_steps: int | float | None
+# Number of times per epoch to run evals, mutually exclusive with eval_steps
+evals_per_epoch: int | None
+# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer
+# from `eval_steps`
+eval_strategy: str | None
+
+# Leave empty to save at each epoch, integer for every N steps. float for fraction of
+# total steps
+save_steps: int | float | None
+# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps
+saves_per_epoch: int | None
+# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better
+# result is achieved, leave empty to infer from `save_steps`
+save_strategy: str | None
+# Maximum number of checkpoints to keep at a time
+save_total_limit: int | None
+# Whether to checkpoint a model after the first step of training. Defaults to False.
+save_first_step: bool | None
+
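For example, per-epoch scheduling might be configured as follows (illustrative values):

evals_per_epoch: 2     # mutually exclusive with eval_steps
saves_per_epoch: 1     # mutually exclusive with save_steps
save_total_limit: 3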
+# Logging frequency
+logging_steps: int | None
+# Stop training after this many evaluation losses have increased in a row.
+# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+early_stopping_patience: int | None
+load_best_model_at_end: bool | None = False
+# Save only the model weights, skipping the optimizer. Using this means you can't resume
+# from checkpoints.
+save_only_model: bool | None = False
+# Use tensorboard for logging
+use_tensorboard: bool | None
+# Enable the pytorch profiler to capture the first N steps of training to the
+# output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more
+# information. Snapshots can be visualized @ https://pytorch.org/memory_viz
+profiler_steps: int | None
+# Which step to start the profiler at. Useful for only capturing a few steps mid-run.
+profiler_steps_start: int | None = 0
+# bool of whether to report tokens per second at the end of training. This is not
+# supported with pre-training datasets.
+include_tokens_per_second: bool | None
+# bool of whether to report tokens per second per-gpu during training by measuring
+# throughput of non-padding tokens.
+include_tkps: bool | None = True
+# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to
+# add noise to embeddings. Currently only supported on Llama and Mistral
+neftune_noise_alpha: float | None
+
+# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to
+# `beta` in `ORPOConfig` due to trl mapping.
+orpo_alpha: float | None
+# Weighting of NLL term in loss from RPO paper
+rpo_alpha: float | None
+# Target reward margin for the SimPO loss
+simpo_gamma: float | None
+# Weight of the BC regularizer
+cpo_alpha: float | None
+
+# Factor for desirable loss term in KTO loss
+kto_desirable_weight: float | None
+# Factor for undesirable loss term in KTO loss
+kto_undesirable_weight: float | None
+# The beta parameter for the RL training
+rl_beta: float | None
+
+# Defines the max memory usage per gpu on the system. Passed through to transformers
+# when loading the model.
+max_memory: dict[int | Literal['cpu', 'disk'], int | str] | None
+# Limit the memory for all available GPUs to this amount (if an integer, expressed in
+# gigabytes); default: unset
+gpu_memory_limit: int | str | None
+# Whether to use low_cpu_mem_usage
+low_cpu_mem_usage: bool | None
+
+# The name of the chat template to use for training, following values are supported:
+# tokenizer_default: Uses the chat template that is available in the
+# tokenizer_config.json. If the chat template is not available in the tokenizer, it will
+# raise an error. This is the default value.
+# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates
+# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.
+# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.
+# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not
+# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.
+# The custom jinja template should be provided in the chat_template_jinja field. The
+# selected chat template will be saved to the tokenizer_config.json for easier
+# inference
+chat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None
+# Custom jinja template or path to jinja file for chat template. This will be only used
+# if chat_template is set to `jinja` or `null` (in which case chat_template is
+# automatically set to `jinja`). Default is null.
+chat_template_jinja: str | None
+# Additional kwargs to pass to the chat template. This is useful for customizing the
+# chat template. For example, you can pass `thinking=False` to add a generation prompt
+# to the chat template.
+chat_template_kwargs: dict[str, Any] | None
+# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the
+# boundaries between conversation turns. For example: ['/INST', '</s>',
+# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is
+# useful for templates that use multiple delimiter tokens.
+eot_tokens: list[str] | None
+# Changes the default system message. Currently only supports chatml.
+default_system_message: str | None
+
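An illustrative chat-template block (the token string is the usual chatml end-of-turn marker, shown only as an example):

chat_template: chatml
eot_tokens:
  - "<|im_end|>"
# or, for a custom template:
# chat_template: jinja
# chat_template_jinja: ./my_template.jinja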
+# Token index or indices to adjust embedding weights to the mean of the other tokens.
+# This is useful when the model has untrained embeddings.
+fix_untrained_tokens: int | list[int] | None
+
+is_preprocess: bool | None
+preprocess_iterable: bool | None
+
+# Total number of tokens - internal use
+total_num_tokens: int | None
+total_supervised_tokens: int | None
+# You can set these packing optimizations AFTER starting a training run at least once. The
+# trainer will provide recommended values for these settings.
+sample_packing_eff_est: float | None
+axolotl_config_path: str | None
+
+# Internal use only - Used to identify what the model is based on
+is_falcon_derived_model: bool | None
+# Internal use only - Used to identify what the model is based on
+is_llama_derived_model: bool | None
+# Internal use only - Used to identify what the model is based on. Please note that if
+# you set this to true, `padding_side` will be set to 'left' by default
+is_mistral_derived_model: bool | None
# Internal use only - Used to identify which the model is based on
-is_llama_derived_model: bool | None
-# Internal use only - Used to identify which the model is based on. Please note that if
-# you set this to true, `padding_side` will be set to 'left' by default
-is_mistral_derived_model: bool | None
-# Internal use only - Used to identify which the model is based on
-is_qwen_derived_model: bool | None
-
-# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available
-# plugins or doc below for more details.
-# https://docs.axolotl.ai/docs/custom_integrations.html
-plugins: list[str] | None
-
-# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This
-# can also be a relative path to a model on disk
-base_model: str (required)
-# If the base_model repo on hf hub doesn't include configuration .json files, You can
-# set that here, or leave this empty to default to base_model
-base_model_config: str | None
-# transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to
-# AutoConfig.
-cls_model_config: str | None
-# Optional tokenizer configuration path in case you want to use a different tokenizer
-# than the one defined in the base model
-tokenizer_config: str | None
-# use_fast option for tokenizer loading from_pretrained, default to True
-tokenizer_use_fast: bool | None
-# Whether to use the legacy tokenizer setting, defaults to True
-tokenizer_legacy: bool | None
-# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-
-# common tokenizer.
-tokenizer_use_mistral_common: bool | None
-# Corresponding tokenizer for the model AutoTokenizer is a good choice
-tokenizer_type: str | None
-# transformers processor class
-processor_type: str | None
-# Whether to save jinja files for tokenizer, transformers default is True
-tokenizer_save_jinja_files: bool | None = True
-# Trust remote code for untrusted source
-trust_remote_code: bool | None
-
-# Don't move the model to the device before sharding. Set to `false` to revert to legacy
-# behavior.
-experimental_skip_move_to_device: bool | None = True
-
-# Use custom kernels, e.g. MegaBlocks.
-use_kernels: bool | None
-
-# Model loading quantization config
-model_quantization_config: Literal['Mxfp4Config'] | None
-# kwargs for model quantization config
-model_quantization_config_kwargs: dict[str, Any] | None
-
-# Where to save the full-finetuned model to
-output_dir: str = ./model-out
-# push checkpoints to hub
-hub_model_id: str | None
-# how to push checkpoints to hub
-hub_strategy: str | None
-# branch/revision to push to on hub (default: main)
-hub_revision: str | None
-# Whether to save the model using safetensors format. Defaults to True.
-save_safetensors: bool | None = True
+is_qwen_derived_model: bool | None
+
+# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available
+# plugins or doc below for more details.
+# https://docs.axolotl.ai/docs/custom_integrations.html
+plugins: list[str] | None
+# Enable sample generation during training for monitoring
+generate_samples: bool | None = False
+# Number of samples to generate at each interval
+num_generation_samples: int | None = 3
+# Maximum new tokens to generate per sample
+generation_max_new_tokens: int | None = 50
+# Temperature for sample generation (0.0 = greedy)
+generation_temperature: float | None = 0.7
+# Nucleus sampling parameter for generation
+generation_top_p: float | None
+# Top-k sampling parameter for generation
+generation_top_k: int | None
+# Ratio of input to use as prompt (0.0-1.0)
+generation_prompt_ratio: float | None = 0.5
+# Whether to use sampling (vs greedy decoding)
+generation_do_sample: bool | None = True
+
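Using the defaults listed above, a minimal monitoring setup could be as simple as:

generate_samples: true
num_generation_samples: 3
generation_max_new_tokens: 50
generation_temperature: 0.7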
+# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This
+# can also be a relative path to a model on disk
+base_model: str (required)
+# If the base_model repo on hf hub doesn't include configuration .json files, you can
+# set that here, or leave this empty to default to base_model
+base_model_config: str | None
+# transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to
+# AutoConfig.
+cls_model_config: str | None
+# Optional tokenizer configuration path in case you want to use a different tokenizer
+# than the one defined in the base model
+tokenizer_config: str | None
+# use_fast option for tokenizer loading from_pretrained, default to True
+tokenizer_use_fast: bool | None
+# Whether to use the legacy tokenizer setting, defaults to True
+tokenizer_legacy: bool | None
+# Whether to use the mistral-common tokenizer
+tokenizer_use_mistral_common: bool | None
+# Corresponding tokenizer for the model; AutoTokenizer is a good choice
+tokenizer_type: str | None
+# transformers processor class
+processor_type: str | None
+# Whether to save jinja files for tokenizer, transformers default is True
+tokenizer_save_jinja_files: bool | None = True
+# Trust remote code for untrusted source
+trust_remote_code: bool | None
+
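A minimal model/tokenizer block might look like this (the model id is a placeholder):

base_model: NousResearch/Meta-Llama-3-8B
tokenizer_type: AutoTokenizer
trust_remote_code: false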
+# Don't move the model to the device before sharding. Set to `false` to revert to legacy
+# behavior.
+experimental_skip_move_to_device: bool | None = True
+
+# Use custom kernels, e.g. MegaBlocks.
+use_kernels: bool | None
+
+# Model loading quantization config
+model_quantization_config: Literal['Mxfp4Config'] | None
+# kwargs for model quantization config
+model_quantization_config_kwargs: dict[str, Any] | None
-# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
-load_in_8bit: bool | None = False
-# Use bitsandbytes 4 bit
-load_in_4bit: bool | None = False
-
-# If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all
-# parameters in original model
-adapter: Literal['lora', 'qlora', 'llama-adapter'] | None
-# If you already have a lora model trained that you want to load, put that here. This
-# means after training, if you want to test the model, you should set this to the value
-# of `output_dir`. Note that if you merge an adapter to the base model, a new
-# subdirectory `merged` will be created under the `output_dir`.
-lora_model_dir: str | None
-lora_r: int | None
-lora_alpha: int | None
-lora_fan_in_fan_out: bool | None
-lora_target_modules: str | list[str] | None
-lora_target_parameters: str | list[str] | None
-# If true, will target all linear modules
-lora_target_linear: bool | None
-# If you added new tokens to the tokenizer, you may need to save some LoRA modules
-# because they need to know the new tokens. For LLaMA and Mistral, you need to save
-# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts
-# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
-lora_modules_to_save: list[str] | None
-lora_dropout: float | None = 0.0
-# The layer indices to transform, otherwise, apply to all layers
-peft_layers_to_transform: list[int] | None
-peft_layers_pattern: list[str] | None
-
-peft: PeftConfig | None
- # For PeftConfig:
- # Configuration options for loftq initialization for LoRA
- loftq_config: LoftQConfig | None
- # For LoftQConfig:
- # typically 4 bits
- loftq_bits: int = 4
-
-# Whether to use DoRA.
-peft_use_dora: bool | None
-# Whether to use RSLoRA.
-peft_use_rslora: bool | None
-# List of layer indices to replicate.
-peft_layer_replication: list[tuple[int, int]] | None
-# How to initialize LoRA weights. Default to True which is MS original implementation.
-peft_init_lora_weights: bool | str | None
-# A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict
-# mapping an embedding layer name to its trainable token indices. See
-# https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-
-# tokens-alongside-lora
-peft_trainable_token_indices: list[int] | dict[str, list[int]] | None
-# Whether to tie adapter weights for tied model weights. See
-# https://github.com/huggingface/peft/issues/2864
-peft_ensure_weight_tying: bool | None
-# Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT.
-peft_autocast_adapter_dtype: bool | None
-
-# load qlora model in sharded format for FSDP using answer.ai technique.
-qlora_sharded_model_loading: bool | None = False
-# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it
-# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
-lora_on_cpu: bool | None
-# Whether you are training a 4-bit GPTQ quantized model
-gptq: bool | None
-# optional overrides to the bnb 4bit quantization configuration
-bnb_config_kwargs: dict[str, Any] | None
-
-# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.
-loraplus_lr_ratio: float | None
-# loraplus learning rate for lora embedding layers. Default value is 1e-6.
-loraplus_lr_embedding: float | None = 1e-06
-
-merge_lora: bool | None
-
-# Whether to use ReLoRA. Use with jagged_restart_*steps options.
-relora: bool | None
-# threshold for optimizer magnitude when pruning
-relora_prune_ratio: float | None
-# True to perform lora weight merges on cpu during restarts, for modest gpu memory
-# savings
-relora_cpu_offload: bool | None
-
-# how often to reset for jagged restarts
-jagged_restart_steps: int | None
-# how many warmup steps to take after reset for jagged restarts
-jagged_restart_warmup_steps: int | None
-# how many anneal steps to take before reset for jagged restarts
-jagged_restart_anneal_steps: int | None
-
-# If greater than 1, backpropagation will be skipped and the gradients will be
-# accumulated for the given number of steps.
-gradient_accumulation_steps: int | None = 1
-# The number of samples to include in each batch. This is the number of samples sent to
-# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps
-micro_batch_size: int | None = 1
-# Total batch size, we do not recommended setting this manually
-batch_size: int | None
-# per gpu micro batch size for evals, defaults to value of micro_batch_size
-eval_batch_size: int | None
+# Where to save the full-finetuned model to
+output_dir: str = ./model-out
+# push checkpoints to hub
+hub_model_id: str | None
+# how to push checkpoints to hub
+hub_strategy: str | None
+# branch/revision to push to on hub (default: main)
+hub_revision: str | None
+# Whether to save the model using safetensors format. Defaults to True.
+save_safetensors: bool | None = True
+
+# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
+load_in_8bit: bool | None = False
+# Use bitsandbytes 4 bit
+load_in_4bit: bool | None = False
+
+# If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all
+# parameters in original model
+adapter: Literal['lora', 'qlora', 'llama-adapter'] | None
+# If you already have a lora model trained that you want to load, put that here. This
+# means after training, if you want to test the model, you should set this to the value
+# of `output_dir`. Note that if you merge an adapter to the base model, a new
+# subdirectory `merged` will be created under the `output_dir`.
+lora_model_dir: str | None
+lora_r: int | None
+lora_alpha: int | None
+lora_fan_in_fan_out: bool | None
+lora_target_modules: str | list[str] | None
+lora_target_parameters: str | list[str] | None
+# If true, will target all linear modules
+lora_target_linear: bool | None
+# If you added new tokens to the tokenizer, you may need to save some LoRA modules
+# because they need to know the new tokens. For LLaMA and Mistral, you need to save
+# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts
+# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
+lora_modules_to_save: list[str] | None
+lora_dropout: float | None = 0.0
+# The layer indices to transform, otherwise, apply to all layers
+peft_layers_to_transform: list[int] | None
+peft_layers_pattern: list[str] | None
+
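A representative QLoRA sketch combining these keys (placeholder hyperparameters, not tuned values):

adapter: qlora
load_in_4bit: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_modules_to_save:     # only needed when new tokens were added
  - embed_tokens
  - lm_head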
+peft: PeftConfig | None
+ # For PeftConfig:
+ # Configuration options for loftq initialization for LoRA
+ loftq_config: LoftQConfig | None
+ # For LoftQConfig:
+ # typically 4 bits
+ loftq_bits: int = 4
+
+# Whether to use DoRA.
+peft_use_dora: bool | None
+# Whether to use RSLoRA.
+peft_use_rslora: bool | None
+# List of layer indices to replicate.
+peft_layer_replication: list[tuple[int, int]] | None
+# How to initialize LoRA weights. Defaults to True, which is the MS original implementation.
+peft_init_lora_weights: bool | str | None
+# A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict
+# mapping an embedding layer name to its trainable token indices. See
+# https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-
+# tokens-alongside-lora
+peft_trainable_token_indices: list[int] | dict[str, list[int]] | None
+# Whether to tie adapter weights for tied model weights. See
+# https://github.com/huggingface/peft/issues/2864
+peft_ensure_weight_tying: bool | None
+# Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT.
+peft_autocast_adapter_dtype: bool | None
+
+# load qlora model in sharded format for FSDP using answer.ai technique.
+qlora_sharded_model_loading: bool | None = False
+# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it
+# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
+lora_on_cpu: bool | None
+# Whether you are training a 4-bit GPTQ quantized model
+gptq: bool | None
+# optional overrides to the bnb 4bit quantization configuration
+bnb_config_kwargs: dict[str, Any] | None
+
+# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.
+loraplus_lr_ratio: float | None
+# loraplus learning rate for lora embedding layers. Default value is 1e-6.
+loraplus_lr_embedding: float | None = 1e-06
+
+merge_lora: bool | None
+
+# Whether to use ReLoRA. Use with jagged_restart_*steps options.
+relora: bool | None
+# threshold for optimizer magnitude when pruning
+relora_prune_ratio: float | None
+# True to perform lora weight merges on cpu during restarts, for modest gpu memory
+# savings
+relora_cpu_offload: bool | None
+
+# how often to reset for jagged restarts
+jagged_restart_steps: int | None
+# how many warmup steps to take after reset for jagged restarts
+jagged_restart_warmup_steps: int | None
+# how many anneal steps to take before reset for jagged restarts
+jagged_restart_anneal_steps: int | None
-# whether to find batch size that fits in memory. Passed to underlying transformers
-# Trainer
-auto_find_batch_size: bool | None
-
-# Whether to mask out or include the human's prompt from the training labels
-train_on_inputs: bool | None = False
-# Group similarly sized data to minimize padding. May be slower to start, as it must
-# download and sort the entire dataset. Note that training loss may have an oscillating
-# pattern with this enabled.
-group_by_length: bool | None
+# If greater than 1, backpropagation will be skipped and the gradients will be
+# accumulated for the given number of steps.
+gradient_accumulation_steps: int | None = 1
+# The number of samples to include in each batch. This is the number of samples sent to
+# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps
+micro_batch_size: int | None = 1
+# Total batch size; we do not recommend setting this manually
+batch_size: int | None
+# per gpu micro batch size for evals, defaults to value of micro_batch_size
+eval_batch_size: int | None
-learning_rate: str | float (required)
-embedding_lr: float | None
-embedding_lr_scale: float | None
-# Specify weight decay
-weight_decay: float | None = 0.0
-# Specify optimizer
-optimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED
-# Dictionary of arguments to pass to the optimizer
-optim_args: str | dict[str, Any] | None
-# The target modules to optimize, i.e. the module names that you would like to train,
-# right now this is used only for GaLore algorithm
-optim_target_modules: list[str] | Literal['all_linear'] | None
-# Path to torch distx for optim 'adamw_anyprecision'
-torchdistx_path: str | None
-lr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE
-# Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler_kwargs: dict[str, Any] | None
-lr_quadratic_warmup: bool | None
-# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of
-# peak lr
-cosine_min_lr_ratio: float | None
-# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means
-# start cosine_min_lr at 80% of training step
-cosine_constant_lr_ratio: float | None
-# Learning rate div factor
-lr_div_factor: float | None
-
-lr_groups: list[LrGroup] | None
- # For LrGroup:
- name: str (required)
- modules: list[str] (required)
- lr: float (required)
-
-# adamw hyperparams
-adam_epsilon: float | None
-# only used for CAME Optimizer
-adam_epsilon2: float | None
-# adamw hyperparams
-adam_beta1: float | None
-# adamw hyperparams
-adam_beta2: float | None
-# only used for CAME Optimizer
-adam_beta3: float | None
+# whether to automatically find a batch size that fits in memory. Passed to the
+# underlying transformers Trainer
+auto_find_batch_size: bool | None
+
+# Whether to include the human's prompt in the training labels rather than masking it out
+train_on_inputs: bool | None = False
+# Group similarly sized data to minimize padding. May be slower to start, as it must
+# download and sort the entire dataset. Note that training loss may have an oscillating
+# pattern with this enabled.
+group_by_length: bool | None
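+# Illustrative sketch (assumed values): mask prompts out of the loss, group similarly
+# sized samples, and let the Trainer probe for a batch size that fits in memory.
+# auto_find_batch_size: true
+# train_on_inputs: false
+# group_by_length: true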
+
+learning_rate: str | float (required)
+embedding_lr: float | None
+embedding_lr_scale: float | None
+# Specify weight decay
+weight_decay: float | None = 0.0
+# Specify optimizer
+optimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED
+# Dictionary of arguments to pass to the optimizer
+optim_args: str | dict[str, Any] | None
+# The target modules to optimize, i.e. the module names that you would like to train.
+# Currently this is only used for the GaLore algorithm
+optim_target_modules: list[str] | Literal['all_linear'] | None
+# Path to torch distx for optim 'adamw_anyprecision'
+torchdistx_path: str | None
+lr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE
+# Specify a scheduler and kwargs to use with the optimizer
+lr_scheduler_kwargs: dict[str, Any] | None
+lr_quadratic_warmup: bool | None
+# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of
+# peak lr
+cosine_min_lr_ratio: float | None
+# freeze the lr at some percentage of the training steps, e.g. cosine_constant_lr_ratio=0.8
+# means the lr is held at cosine_min_lr from 80% of the training steps onward
+cosine_constant_lr_ratio: float | None
+# Learning rate div factor
+lr_div_factor: float | None
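+# Illustrative sketch (values are assumptions, not tuned recommendations): AdamW with a
+# cosine schedule that decays to 10% of the peak lr and holds it from 80% of training.
+# learning_rate: 2.0e-4
+# weight_decay: 0.01
+# optimizer: adamw_torch_fused
+# lr_scheduler: cosine
+# cosine_min_lr_ratio: 0.1
+# cosine_constant_lr_ratio: 0.8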
+
+lr_groups: list[LrGroup] | None
+ # For LrGroup:
+ name: str (required)
+ modules: list[str] (required)
+ lr: float (required)
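+# Illustrative sketch (module names are assumptions for a typical decoder model): give
+# the embedding and output layers their own, smaller learning rate.
+# lr_groups:
+#   - name: embeddings
+#     modules: ["embed_tokens", "lm_head"]
+#     lr: 1.0e-5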
-# Dion Optimizer learning rate
-dion_lr: float | None
-# Dion Optimizer momentum
-dion_momentum: float | None
-# Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank
-# dimension.
-dion_rank_fraction: float | None = 1.0
-# Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may
-# be useful to ensure even sharding.
-dion_rank_multiple_of: int | None = 1
+# adamw hyperparams
+adam_epsilon: float | None
+# only used for CAME Optimizer
+adam_epsilon2: float | None
+# adamw hyperparams
+adam_beta1: float | None
+# adamw hyperparams
+adam_beta2: float | None
+# only used for CAME Optimizer
+adam_beta3: float | None
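+# Illustrative sketch (assumed values, shown only to make the field names concrete):
+# standard AdamW moments; the *2/*3 variants apply only to the CAME optimizer.
+# adam_beta1: 0.9
+# adam_beta2: 0.999
+# adam_epsilon: 1.0e-8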
-# Gradient clipping max norm
-max_grad_norm: float | None
-num_epochs: float = 1.0
-
-use_wandb: bool | None
-# Set the name of your wandb run
-wandb_name: str | None
-# Set the ID of your wandb run
-wandb_run_id: str | None
-# "offline" to save run metadata locally and not sync to the server, "disabled" to turn
-# off wandb
-wandb_mode: str | None
-# Your wandb project name
-wandb_project: str | None
-# A wandb Team name if using a Team
-wandb_entity: str | None
-wandb_watch: str | None
-# "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only
-# at the end of training
-wandb_log_model: str | None
-
-use_mlflow: bool | None
-# URI to mlflow
-mlflow_tracking_uri: str | None
-# Your experiment name
-mlflow_experiment_name: str | None
-# Your run name
-mlflow_run_name: str | None
-# set to true to copy each saved checkpoint on each save to mlflow artifact registry
-hf_mlflow_log_artifacts: bool | None
-
-# Enable or disable Comet integration.
-use_comet: bool | None
-# API key for Comet. Recommended to set via `comet login`.
-comet_api_key: str | None
-# Workspace name in Comet. Defaults to the user's default workspace.
-comet_workspace: str | None
-# Project name in Comet. Defaults to Uncategorized.
-comet_project_name: str | None
-# Identifier for the experiment. Used to append data to an existing experiment or
-# control the key of new experiments. Default to a random key.
-comet_experiment_key: str | None
-# Create a new experiment ("create") or log to an existing one ("get"). Default
-# ("get_or_create") auto-selects based on configuration.
-comet_mode: str | None
-# Set to True to log data to Comet server, or False for offline storage. Default is
-# True.
-comet_online: bool | None
-# Dictionary for additional configuration settings, see the doc for more details.
-comet_experiment_config: dict[str, Any] | None
-
-use_trackio: bool | None
-# Your trackio project name
-trackio_project_name: str | None
-# Set the name of your trackio run
-trackio_run_name: str | None
-# Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)
-trackio_space_id: str | None
-
-# Enable OpenTelemetry metrics collection and Prometheus export
-use_otel_metrics: bool | None = False
-# Host to bind the OpenTelemetry metrics server to
-otel_metrics_host: str | None = localhost
-# Port for the Prometheus metrics HTTP server
-otel_metrics_port: int | None = 8000
-
-# the number of activate layers in LISA
-lisa_n_layers: int | None
-# how often to switch layers in LISA
-lisa_step_interval: int | None
-# path under the model to access the layers
-lisa_layers_attribute: str | None = model.layers
-
-gradio_title: str | None
-gradio_share: bool | None
-gradio_server_name: str | None
-gradio_server_port: int | None
-gradio_max_new_tokens: int | None
-gradio_temperature: float | None
-
-use_ray: bool = False
-ray_run_name: str | None
-ray_num_workers: int = 1
-resources_per_worker: dict
-
-# The size of the image to resize to. It can be an integer (resized into padded-square
-# image) or a tuple (width, height).If not provided, we will attempt to load from
-# preprocessor.size, otherwise, images won't be resized.
-image_size: int | tuple[int, int] | None
-# The resampling algorithm to use for image resizing. Default is bilinear. Please refer
-# to PIL.Image.Resampling for more details.
-image_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None
-
-# optional overrides to the base model configuration
-overrides_of_model_config: dict[str, Any] | None
-# optional overrides the base model loading from_pretrained
-overrides_of_model_kwargs: dict[str, Any] | None
-# If you want to specify the type of model to load, AutoModelForCausalLM is a good
-# choice too
-type_of_model: str | None
-# You can specify to choose a specific model revision from huggingface hub
-revision_of_model: str | None
-
-max_packed_sequence_len: int | None
-rope_scaling: Any | None
-noisy_embedding_alpha: float | None
-dpo_beta: float | None
-evaluation_strategy: str | None