accidental file
.github/workflows/tests.yml (vendored)
@@ -371,7 +371,6 @@ jobs:
directory: coverage-reports
fail_ci_if_error: false
verbose: true
carryforward: true
name: codecov-umbrella
override_commit: ${{ github.event.pull_request.head.sha || github.sha }}
override_pr: ${{ github.event.pull_request.number }}

@@ -1,780 +0,0 @@
---
title: Config Reference
description: A complete list of all configuration options.
---

```yaml
# Allow overwriting the yml config from the cli
strict: bool | None = False
# Resume from a specific checkpoint dir
resume_from_checkpoint: str | None
# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
# Be careful with this being turned on between different models.
auto_resume_from_checkpoints: bool | None
# Resize the model embeddings when new tokens are added to multiples of 32. This is
# reported to improve training speed on some models
resize_token_embeddings_to_32x: bool | None
mean_resizing_embeddings: bool | None = False

# Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
shrink_embeddings: bool | None
# Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs
embeddings_skip_upcast: bool | None

# Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'
rl: RLType | None

trl:
  # Beta parameter for the RL training. Same as `rl_beta`.
  beta: float | None
  # Maximum length of the completion for RL training.
  max_completion_length: int | None

  # Whether to use VLLM for RL training.
  use_vllm: bool = False
  # Host of the vLLM server to connect to.
  vllm_server_host: str | None = 0.0.0.0
  # Port of the vLLM server to connect to.
  vllm_server_port: int | None = 8000
  # Total timeout (in seconds) to wait for the vLLM server to respond.
  vllm_server_timeout: int | None
  # Regex for vLLM guided decoding.
  vllm_guided_decoding_regex: str | None

  # List of reward functions to load. Paths must be importable from current dir.
  reward_funcs: list[str] | None
  # List of reward weights for the reward functions.
  reward_weights: list[float] | None
  # Number of generations to sample.
  num_generations: int | None
  # Whether to log completions.
  log_completions: bool | None = False
  # Number of completions to print when log_completions is True.
  num_completions_to_print: int | None
  # Whether to sync the reference model.
  sync_ref_model: bool | None = False
  # Mixup alpha for the reference model.
  ref_model_mixup_alpha: float | None = 0.9
  # Sync steps for the reference model.
  ref_model_sync_steps: int | None = 64
  # Whether to scale rewards by their standard deviation.
  scale_rewards: bool = True

  # Sampling temperature for the GRPO policy.
  temperature: float | None
  # Top-p sampling probability for the generation policy.
  top_p: float | None
  # Top-k sampling for the generation policy.
  top_k: int | None
  # Minimum probability for the generation policy.
  min_p: float | None
  # Penalty for tokens that appear in prompt and generated text.
  repetition_penalty: float | None
  # Number of iterations per batch (μ) for GRPO.
  num_iterations: int | None
  # Epsilon value for clipping in the GRPO algorithm.
  epsilon: float | None
  # Upper-bound epsilon value for clipping in the GRPO algorithm.
  epsilon_high: float | None
  # Whether to use Liger loss for GRPO.
  use_liger_loss: bool | None
  # Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.
  loss_type: str | None
  # Whether to exclude truncated completions from loss calculation.
  mask_truncated_completions: bool = False

vllm:
  # Device to use for VLLM
  device: str | None = auto
  # Tensor parallel size for VLLM
  tensor_parallel_size: int | None
  # GPU memory utilization for VLLM
  gpu_memory_utilization: float | None = 0.9
  # Data type for VLLM
  dtype: str | None = auto
  # Maximum length of the model context for VLLM
  max_model_len: int | None
  # Enable prefix caching for VLLM
  enable_prefix_caching: bool | None
  # Host for the vLLM server to start on
  host: str | None = 0.0.0.0
  # Port of the vLLM server to start on
  port: int | None = 8000

  # Enable reasoning for VLLM
  enable_reasoning: bool | None
  # Reasoning parser for VLLM
  reasoning_parser: str | None

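# Illustrative example (placeholder values, not recommended defaults): a minimal
# GRPO setup connecting to an external vLLM server could combine the options above
# roughly like this:
#
#   rl: grpo
#   trl:
#     beta: 0.001
#     num_generations: 4
#     use_vllm: true
#     vllm_server_host: 0.0.0.0
#     vllm_server_port: 8000
#   vllm:
#     tensor_parallel_size: 1
#     gpu_memory_utilization: 0.9
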
qat:
  # Fake quantization layout to use for activation quantization. Valid options are
  # "int4" and "int8"
  activation_dtype: TorchIntDType | None
  # Fake quantization layout to use for weight quantization. Valid options are "int4"
  # and "int8"
  weight_dtype: TorchIntDType = TorchIntDType.int8
  # Quantize embedding
  quantize_embedding: bool | None = False
  # The number of elements in each group for per-group fake quantization
  group_size: int | None = 32
  # The number of steps to apply fake quantization after
  fake_quant_after_n_steps: int | None

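# Illustrative example (placeholder values): int8 weight fake-quantization with
# per-group quantization might look like:
#
#   qat:
#     weight_dtype: int8
#     group_size: 32
#     fake_quant_after_n_steps: 1000
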
quantization:
  # Fake quantization layout to use for weight quantization. Valid options are uintX for
  # X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
  weight_dtype: TorchIntDType = TorchIntDType.int8
  # Fake quantization layout to use for activation quantization. Valid options are
  # "int4" and "int8"
  activation_dtype: TorchIntDType | None
  # Whether to quantize the embedding layer.
  quantize_embedding: bool | None
  # The number of elements in each group for per-group fake quantization
  group_size: int | None = 32

# Reward modelling: `True` or `False`
reward_model: bool | None
# Process reward modelling: `True` or `False`
process_reward_model: bool | None
num_labels: int | None

# Whether to perform weighting in DPO trainer
dpo_use_weighting: bool | None
dpo_use_logits_to_keep: bool | None
dpo_label_smoothing: float | None
dpo_norm_loss: bool | None
dpo_padding_free: bool | None

# A list of one or more datasets to finetune the model with
datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None

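# Illustrative example (placeholder dataset): a typical SFT entry uses the dataset
# sub-fields `path` and `type` (these field names come from the dataset schema and
# are shown here only for illustration):
#
#   datasets:
#     - path: tatsu-lab/alpaca
#       type: alpaca
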
# A list of one or more datasets to eval the model with. You can use either
# test_datasets, or val_set_size, but not both.
test_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None
# If false, the datasets will not be shuffled and will keep their original order in
# `datasets`. The same applies to the `test_datasets` option and the
# `pretraining_dataset` option. Default is true.
shuffle_merged_datasets: bool | None = True
# Axolotl attempts to save the dataset as an arrow after packing the data together so
# subsequent training attempts load faster, relative path
dataset_prepared_path: str | None
# Num shards for whole dataset
dataset_shard_num: int | None
# Index of shard to use for whole dataset
dataset_shard_idx: int | None
skip_prepare_dataset: bool | None = False

# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
pretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None
# The maximum number of processes to use while preprocessing your input dataset. This
# defaults to `os.cpu_count()` if not set.
dataset_processes: int | None = 14
# Deduplicates datasets and test_datasets with identical entries
dataset_exact_deduplication: bool | None
# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking
# too much storage
dataset_keep_in_memory: bool | None
dataloader_pin_memory: bool | None
dataloader_num_workers: int | None
dataloader_prefetch_factor: int | None
dataloader_drop_last: bool | None

accelerator_config: dict[str, Any] | None

remove_unused_columns: bool | None

# Push prepared dataset to hub - repo_org/repo_name
push_dataset_to_hub: str | None
# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private
# datasets. Required to be true when used in combination with `push_dataset_to_hub`
hf_use_auth_token: bool | None

device: Any | None
# Passed through to transformers when loading the model when launched without
# accelerate. Use `sequential` when training w/ model parallelism to limit memory
device_map: Any | None
world_size: int | None
# Don't mess with this, it's here for accelerate and torchrun
local_rank: int | None
ddp: bool | None

# Seed for reproducibility
seed: int | None
# Advanced DDP Arguments - timeout
ddp_timeout: int | None
# Advanced DDP Arguments - bucket cap in MB
ddp_bucket_cap_mb: int | None
# Advanced DDP Arguments - broadcast buffers
ddp_broadcast_buffers: bool | None
ddp_find_unused_parameters: bool | None

# Approximate number of predictions sent to wandb depending on batch size. Enabled above
# 0. Default is 0
eval_table_size: int | None
# Total number of tokens generated for predictions sent to wandb. Default is 128
eval_max_new_tokens: int | None
# Whether to run causal language model evaluation for metrics in
# `eval_causal_lm_metrics`
do_causal_lm_eval: bool | None
# HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter',
# 'chrf', 'perplexity']
eval_causal_lm_metrics: list[str] | None
do_bench_eval: bool | None
bench_dataset: str | None
bench_split: str | None
metric_for_best_model: str | None
greater_is_better: bool | None

# High loss value, indicating the learning has broken down (a good estimate is ~2 times
# the loss at the start of training)
loss_watchdog_threshold: float | None
# Number of high-loss steps in a row before the trainer aborts (default: 3)
loss_watchdog_patience: int | None

gc_steps: int | None

# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.
# require >=ampere
bf16: Literal['auto'] | bool | None = auto
# Use CUDA fp16
fp16: bool | None
fp8: bool | None
# No AMP (automatic mixed precision) - require >=ampere
bfloat16: bool | None
# No AMP (automatic mixed precision)
float16: bool | None
# Use CUDA tf32 - require >=ampere
tf32: bool | None
float32: bool | None

# Whether to use gradient checkpointing. Available options are: true, false, 'offload',
# 'offload_disk'.
# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
gradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False
# Additional kwargs to pass to the trainer for gradient checkpointing
gradient_checkpointing_kwargs: dict[str, Any] | None

unfrozen_parameters: list[str] | None

# The maximum length of an input to train with; this should typically be less than 2048
# as most models have a token/context limit of 2048
sequence_len: int = 512
min_sample_len: int | None
# maximum prompt length for RL training
max_prompt_len: int = 512
# Use efficient multi-packing with block diagonal attention and per sequence
# position_ids. Recommended to set to 'true'
sample_packing: bool | None
# The number of samples packed at a time. Increasing the following values helps with
# packing, but usually only slightly (<1%).
sample_packing_group_size: int | None = 100000
# The number of samples which can be packed into one sequence. Increase if using a large
# sequence_len with many short samples.
sample_packing_bin_size: int | None = 200
# Whether to pack samples sequentially
sample_packing_sequentially: bool | None
# Set to 'false' if getting errors during eval with sample_packing on
eval_sample_packing: bool | None
# Pad inputs so each step uses constant sized buffers. This will reduce memory
# fragmentation and may prevent OOMs, by re-using memory more efficiently
pad_to_sequence_len: bool | None
# Whether to use sequential sampling for curriculum learning
curriculum_sampling: bool | None
multipack_real_batches: bool | None
# whether to concatenate samples during pretraining
pretraining_sample_concatenation: bool | None

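# Illustrative example (placeholder values): longer sequences with packing enabled
# might be configured as:
#
#   sequence_len: 2048
#   sample_packing: true
#   eval_sample_packing: false
#   pad_to_sequence_len: true
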
# Use batch flattening for speedups when not using sample_packing
batch_flattening: Literal['auto'] | bool | None

use_pose: bool | None
pose_split_on_token_ids: list[int] | None
pose_max_context_len: int | None
pose_num_chunks: int | None

pretrain_multipack_buffer_size: int | None = 10000
# whether to prevent cross attention for packed sequences during pretraining
pretrain_multipack_attn: bool | None = True

# Whether to use xformers attention patch https://github.com/facebookresearch/xformers
xformers_attention: bool | None
# Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/
# torch.nn.functional.scaled_dot_product_attention.html
sdp_attention: bool | None
# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
s2_attention: bool | None
flex_attention: bool | None
flex_attn_compile_kwargs: dict[str, Any] | None
# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention
flash_attention: bool | None
# Whether to use flash-attention cross entropy implementation - advanced use only
flash_attn_cross_entropy: bool | None
# Whether to use flash-attention rms norm implementation - advanced use only
flash_attn_rms_norm: bool | None
# Whether to fuse QKV into a single operation
flash_attn_fuse_qkv: bool | None
# Whether to fuse part of the MLP into a single operation
flash_attn_fuse_mlp: bool | None
# Whether to use bettertransformers
flash_optimum: bool | None

eager_attention: bool | None

unsloth_cross_entropy_loss: bool | None
unsloth_lora_mlp: bool | None
unsloth_lora_qkv: bool | None
unsloth_lora_o: bool | None
unsloth_rms_norm: bool | None
unsloth_rope: bool | None

# Apply custom LoRA autograd functions and activation function Triton kernels for speed
# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
lora_mlp_kernel: bool | None
# Apply custom LoRA autograd functions and activation function Triton kernels for speed
# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
lora_qkv_kernel: bool | None
# Apply custom LoRA autograd functions and activation function Triton kernels for speed
# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
lora_o_kernel: bool | None

llama4_linearized_experts: bool | None

# Deepspeed config path. e.g., deepspeed_configs/zero3.json
deepspeed: str | dict[str, Any] | None
# FSDP configuration
fsdp: list[str] | None
# FSDP configuration options
fsdp_config: dict[str, Any] | None
fsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None

# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for
# no eval.
val_set_size: float | None = 0.0

# Set to a divisor of the number of GPUs available to split sequences into chunks of
# equal size. Use in long context training to prevent OOM when sequences cannot fit into
# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each
# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized
# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more
# details.
sequence_parallel_degree: int | None
# Optional; strides across the key dimension. Larger values use more memory but should
# make training faster. Must evenly divide the number of KV heads in your model.
heads_k_stride: int | None
# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to
# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing
# case.
ring_attn_func: RingAttnFunc | None

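# Illustrative example (placeholder values): on a 4-GPU node, splitting each long
# sequence across 2 GPUs could be configured as:
#
#   sequence_parallel_degree: 2
#   heads_k_stride: 1
#   ring_attn_func: varlen_llama3
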
# Add or change special tokens. If you add tokens here, you don't need to add them to
# the `tokens` list.
special_tokens:
  bos_token: str | None
  eos_token: str | None
  pad_token: str | None
  unk_token: str | None
  additional_special_tokens: list[str] | None

# Add extra tokens to the tokenizer
tokens: list[str] | None
# Mapping token_id to new_token_string to override reserved added_tokens in the
# tokenizer. Only works for tokens that are not part of the base vocab (aka are
# added_tokens). Can be checked if they exist in tokenizer.json added_tokens.
added_tokens_overrides: dict[int, str] | None

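# Illustrative example (placeholder token strings): adding a ChatML-style end-of-turn
# token and a dedicated pad token might look like:
#
#   special_tokens:
#     eos_token: "<|im_end|>"
#     pad_token: "<pad>"
#   tokens:
#     - "<|im_start|>"
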
# Whether to use torch.compile and which backend to use. Setting to `auto` will enable
# torch compile when torch>=2.5.1
torch_compile: Literal['auto'] | bool | None
# Backend to use for torch.compile
torch_compile_backend: str | None
torch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None

# Maximum number of iterations to train for. It precedes num_epochs which means that if
# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =>
# `num_epochs: 2` and `max_steps: 100` will train for 100 steps
max_steps: int | None
# Number of warmup steps. Cannot use with warmup_ratio
warmup_steps: int | None
# Warmup ratio. Cannot use with warmup_steps
warmup_ratio: float | None
# Leave empty to eval at each epoch, integer for every N steps. float for fraction of
# total steps
eval_steps: int | float | None
# Number of times per epoch to run evals, mutually exclusive with eval_steps
evals_per_epoch: int | None
# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer
# from `eval_steps`
eval_strategy: str | None
# Leave empty to save at each epoch, integer for every N steps. float for fraction of
# total steps
save_steps: int | float | None
# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps
saves_per_epoch: int | None
# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better
# result is achieved, leave empty to infer from `save_steps`
save_strategy: str | None
# Checkpoints saved at a time
save_total_limit: int | None
# Logging frequency
logging_steps: int | None
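
# Illustrative example (placeholder values): a common cadence is to evaluate and save
# a few times per epoch while keeping only the most recent checkpoints:
#
#   warmup_ratio: 0.03
#   evals_per_epoch: 2
#   saves_per_epoch: 1
#   save_total_limit: 3
#   logging_steps: 10
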
# Stop training after this many evaluation losses have increased in a row.
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
early_stopping_patience: int | None
load_best_model_at_end: bool | None = False
# Save only the model weights, skipping the optimizer. Using this means you can't resume
# from checkpoints.
save_only_model: bool | None = False
# Use tensorboard for logging
use_tensorboard: bool | None
# Enable the pytorch profiler to capture the first N steps of training to the
# output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more
# information. Snapshots can be visualized @ https://pytorch.org/memory_viz
profiler_steps: int | None
# bool of whether to include tokens per second in the training metrics. This
# iterates over the entire dataset once, so it takes some time.
include_tokens_per_second: bool | None

# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to
# add noise to embeddings. Currently only supported on Llama and Mistral
neftune_noise_alpha: float | None

# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to
# `beta` in `ORPOConfig` due to trl mapping.
orpo_alpha: float | None
# Weighting of NLL term in loss from RPO paper
rpo_alpha: float | None
# Target reward margin for the SimPO loss
simpo_gamma: float | None
# Weight of the BC regularizer
cpo_alpha: float | None

# Factor for desirable loss term in KTO loss
kto_desirable_weight: float | None
# Factor for undesirable loss term in KTO loss
kto_undesirable_weight: float | None
# The beta parameter for the RL training
rl_beta: float | None

# Defines the max memory usage per gpu on the system. Passed through to transformers
# when loading the model.
max_memory: dict[int | Literal['cpu', 'disk'], int | str] | None
# Limit the memory for all available GPUs to this amount (if an integer, expressed in
# gigabytes); default: unset
gpu_memory_limit: int | str | None
# Whether to use low_cpu_mem_usage
low_cpu_mem_usage: bool | None

# The name of the chat template to use for training. The following values are supported:
# - tokenizer_default: Uses the chat template that is available in the
#   tokenizer_config.json. If the chat template is not available in the tokenizer, it
#   will raise an error. This is the default value.
# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates
#   are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.
# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback
#   to, e.g. tokenizer_default_fallback_chatml. This is useful when the chat template is
#   not available in the tokenizer.
# - jinja: Uses a custom jinja template for the chat template. The custom jinja template
#   should be provided in the chat_template_jinja field.
# The selected chat template will be saved to the tokenizer_config.json for easier
# inferencing.
chat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None
# Custom jinja template for chat template. This will be only used if chat_template is
# set to `jinja` or `null` (in which case chat_template is automatically set to
# `jinja`). Default is null.
chat_template_jinja: str | None
# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the
# boundaries between conversation turns. For example: ['/INST', '</s>',
# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is
# useful for templates that use multiple delimiter tokens.
eot_tokens: list[str] | None
# Changes the default system message. Currently only supports chatml.
default_system_message: str | None

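# Illustrative example (placeholder values): training against the ChatML template
# with an explicit end-of-turn token might look like:
#
#   chat_template: chatml
#   default_system_message: You are a helpful assistant.
#   eot_tokens:
#     - "<|im_end|>"
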
fix_untrained_tokens: int | list[int] | None

is_preprocess: bool | None
preprocess_iterable: bool | None

# Total number of tokens - internal use
total_num_tokens: int | None
total_supervised_tokens: int | None
# You can set these packing optimizations AFTER starting a training at least once. The
# trainer will provide recommended values for these values.
sample_packing_eff_est: float | None
axolotl_config_path: str | None

# Internal use only - Used to identify what the model is based on
is_falcon_derived_model: bool | None
# Internal use only - Used to identify what the model is based on
is_llama_derived_model: bool | None
# Internal use only - Used to identify what the model is based on. Please note that if
# you set this to true, `padding_side` will be set to 'left' by default
is_mistral_derived_model: bool | None
# Internal use only - Used to identify what the model is based on
is_qwen_derived_model: bool | None

# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available
# plugins or doc below for more details.
# https://docs.axolotl.ai/docs/custom_integrations.html
plugins: list[str] | None

# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This
# can also be a relative path to a model on disk
base_model: str (required)
# If the base_model repo on hf hub doesn't include configuration .json files, you can
# set that here, or leave this empty to default to base_model
base_model_config: str | None
cls_model_config: str | None
# Optional tokenizer configuration path in case you want to use a different tokenizer
# than the one defined in the base model
tokenizer_config: str | None
# use_fast option for tokenizer loading from_pretrained, default to True
tokenizer_use_fast: bool | None
# Whether to use the legacy tokenizer setting, defaults to True
tokenizer_legacy: bool | None
# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-
# common tokenizer.
tokenizer_use_mistral_common: bool | None
# Corresponding tokenizer for the model. AutoTokenizer is a good choice
tokenizer_type: str | None
# transformers processor class
processor_type: str | None
# Trust remote code for untrusted source
trust_remote_code: bool | None

# Where to save the full-finetuned model to
output_dir: str = ./model-out
# push checkpoints to hub
hub_model_id: str | None
# how to push checkpoints to hub
hub_strategy: str | None
# Save model as safetensors (requires safetensors package). Default True
save_safetensors: bool | None = True

# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
load_in_8bit: bool | None = False
# Use bitsandbytes 4 bit
load_in_4bit: bool | None = False

# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in
# original model
adapter: str | None
# If you already have a lora model trained that you want to load, put that here. This
# means after training, if you want to test the model, you should set this to the value
# of `output_dir`. Note that if you merge an adapter to the base model, a new
# subdirectory `merged` will be created under the `output_dir`.
lora_model_dir: str | None
lora_r: int | None
lora_alpha: int | None
lora_fan_in_fan_out: bool | None
lora_target_modules: str | list[str] | None
# If true, will target all linear modules
lora_target_linear: bool | None
# If you added new tokens to the tokenizer, you may need to save some LoRA modules
# because they need to know the new tokens. For LLaMA and Mistral, you need to save
# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts
# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
lora_modules_to_save: list[str] | None
lora_dropout: float | None = 0.0
# The layer indices to transform, otherwise, apply to all layers
peft_layers_to_transform: list[int] | None
peft_layers_pattern: list[str] | None

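# Illustrative example (placeholder values): a typical QLoRA adapter setup might
# combine the options above roughly like this:
#
#   adapter: qlora
#   load_in_4bit: true
#   lora_r: 16
#   lora_alpha: 32
#   lora_dropout: 0.05
#   lora_target_linear: true
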
peft:
  # Configuration options for loftq initialization for LoRA
  loftq_config:
    # typically 4 bits
    loftq_bits: int = 4

# Whether to use DoRA.
peft_use_dora: bool | None
# Whether to use RSLoRA.
peft_use_rslora: bool | None
# List of layer indices to replicate.
peft_layer_replication: list[tuple[int, int]] | None
# How to initialize LoRA weights. Defaults to True, which is the MS original
# implementation.
peft_init_lora_weights: bool | str | None

# load qlora model in sharded format for FSDP using answer.ai technique.
qlora_sharded_model_loading: bool | None = False
# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it
# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
lora_on_cpu: bool | None
# Whether you are training a 4-bit GPTQ quantized model
gptq: bool | None
# optional overrides to the bnb 4bit quantization configuration
bnb_config_kwargs: dict[str, Any] | None

# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.
loraplus_lr_ratio: float | None
# loraplus learning rate for lora embedding layers. Default value is 1e-6.
loraplus_lr_embedding: float | None = 1e-06

merge_lora: bool | None

# Number of steps per ReLoRA restart
relora_steps: int | None
# Number of per-restart warmup steps
relora_warmup_steps: int | None
# Number of anneal steps for each relora cycle
relora_anneal_steps: int | None
# threshold for optimizer magnitude when pruning
relora_prune_ratio: float | None
# True to perform lora weight merges on cpu during restarts, for modest gpu memory
# savings
relora_cpu_offload: bool | None

# If greater than 1, backpropagation will be skipped and the gradients will be
# accumulated for the given number of steps.
gradient_accumulation_steps: int | None = 1
# The number of samples to include in each batch. This is the number of samples sent to
# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps
micro_batch_size: int | None = 1
# Total batch size, we do not recommend setting this manually
batch_size: int | None
# per gpu micro batch size for evals, defaults to value of micro_batch_size
eval_batch_size: int | None

# whether to find batch size that fits in memory. Passed to underlying transformers
# Trainer
auto_find_batch_size: bool | None

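# Illustrative arithmetic (placeholder values): with `micro_batch_size: 2` and
# `gradient_accumulation_steps: 4`, each GPU takes an optimizer step after
# 2 * 4 = 8 samples; with 2 GPUs the effective global batch size is 16.
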
# Whether to mask out or include the human's prompt from the training labels
train_on_inputs: bool | None = False
# Group similarly sized data to minimize padding. May be slower to start, as it must
# download and sort the entire dataset. Note that training loss may have an oscillating
# pattern with this enabled.
group_by_length: bool | None

learning_rate: str | float (required)
embedding_lr: float | None
embedding_lr_scale: float | None
# Specify weight decay
weight_decay: float | None = 0.0
# Specify optimizer
optimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED
# Dictionary of arguments to pass to the optimizer
optim_args: str | dict[str, Any] | None
# The target modules to optimize, i.e. the module names that you would like to train,
# right now this is used only for GaLore algorithm
optim_target_modules: list[str] | Literal['all_linear'] | None
# Path to torch distx for optim 'adamw_anyprecision'
torchdistx_path: str | None
lr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE
# Specify a scheduler and kwargs to use with the optimizer
lr_scheduler_kwargs: dict[str, Any] | None
lr_quadratic_warmup: bool | None
# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of
# peak lr
cosine_min_lr_ratio: float | None
# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means
# start cosine_min_lr at 80% of training step
cosine_constant_lr_ratio: float | None
# Learning rate div factor
lr_div_factor: float | None
lr_groups: list[LrGroup] | None

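# Illustrative example (placeholder values): a common optimizer/scheduler block
# might look like:
#
#   learning_rate: 2e-5
#   weight_decay: 0.01
#   optimizer: adamw_torch_fused
#   lr_scheduler: cosine
#   cosine_min_lr_ratio: 0.1
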
# adamw hyperparams
adam_epsilon: float | None
# only used for CAME Optimizer
adam_epsilon2: float | None
# adamw hyperparams
adam_beta1: float | None
# adamw hyperparams
adam_beta2: float | None
# only used for CAME Optimizer
adam_beta3: float | None
# Gradient clipping max norm
max_grad_norm: float | None
num_epochs: float = 1.0

use_wandb: bool | None
# Set the name of your wandb run
wandb_name: str | None
# Set the ID of your wandb run
wandb_run_id: str | None
# "offline" to save run metadata locally and not sync to the server, "disabled" to turn
# off wandb
wandb_mode: str | None
# Your wandb project name
wandb_project: str | None
# A wandb Team name if using a Team
wandb_entity: str | None
wandb_watch: str | None
# "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only
# at the end of training
wandb_log_model: str | None

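# Illustrative example (placeholder names): logging to a W&B team project might
# look like:
#
#   wandb_project: my-project
#   wandb_entity: my-team
#   wandb_name: sft-run-1
#   wandb_log_model: end
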
use_mlflow: bool | None
# URI to mlflow
mlflow_tracking_uri: str | None
# Your experiment name
mlflow_experiment_name: str | None
# Your run name
mlflow_run_name: str | None
# set to true to copy each saved checkpoint on each save to mlflow artifact registry
hf_mlflow_log_artifacts: bool | None

# Enable or disable Comet integration.
use_comet: bool | None
# API key for Comet. Recommended to set via `comet login`.
comet_api_key: str | None
# Workspace name in Comet. Defaults to the user's default workspace.
comet_workspace: str | None
# Project name in Comet. Defaults to Uncategorized.
comet_project_name: str | None
# Identifier for the experiment. Used to append data to an existing experiment or
# control the key of new experiments. Defaults to a random key.
comet_experiment_key: str | None
# Create a new experiment ("create") or log to an existing one ("get"). Default
# ("get_or_create") auto-selects based on configuration.
comet_mode: str | None
# Set to True to log data to Comet server, or False for offline storage. Default is
# True.
comet_online: bool | None
# Dictionary for additional configuration settings, see the doc for more details.
comet_experiment_config: dict[str, Any] | None

# the number of active layers in LISA
lisa_n_layers: int | None
# how often to switch layers in LISA
lisa_step_interval: int | None
# path under the model to access the layers
lisa_layers_attribute: str | None = model.layers

gradio_title: str | None
gradio_share: bool | None
gradio_server_name: str | None
gradio_server_port: int | None
gradio_max_new_tokens: int | None
gradio_temperature: float | None

use_ray: bool = False
ray_run_name: str | None
ray_num_workers: int = 1
resources_per_worker: dict

# The size of the image to resize to. It can be an integer (resized into padded-square
# image) or a tuple (width, height). If not provided, we will attempt to load from
# preprocessor.size, otherwise, images won't be resized.
image_size: int | tuple[int, int] | None
# The resampling algorithm to use for image resizing. Default is bilinear. Please refer
# to PIL.Image.Resampling for more details.
image_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None

# optional overrides to the base model configuration
overrides_of_model_config: dict[str, Any] | None
# optional overrides the base model loading from_pretrained
overrides_of_model_kwargs: dict[str, Any] | None
# If you want to specify the type of model to load, AutoModelForCausalLM is a good
# choice too
type_of_model: str | None
# You can specify to choose a specific model revision from huggingface hub
revision_of_model: str | None

max_packed_sequence_len: int | None
rope_scaling: Any | None
noisy_embedding_alpha: float | None
dpo_beta: float | None
evaluation_strategy: str | None
```