diff --git a/cicd/e2e_tests.py b/cicd/e2e_tests.py index ce9c605c7..610e3730d 100644 --- a/cicd/e2e_tests.py +++ b/cicd/e2e_tests.py @@ -1,5 +1,7 @@ """Modal app to run axolotl GPU tests""" +import pathlib + from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd @@ -12,9 +14,21 @@ from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd volumes=VOLUME_CONFIG, ) def cicd_pytest(): + run_cmd("./cicd/cicd.sh", "/workspace/axolotl") + # Read the coverage file if it exists + coverage_file = pathlib.Path("/workspace/axolotl/e2e-coverage.xml") + if coverage_file.exists(): + return coverage_file.read_text(encoding="utf-8") + return None + @app.local_entrypoint() def main(): - cicd_pytest.remote() + coverage = cicd_pytest.remote() + + # Save the coverage file to the local filesystem if it was generated + if coverage: + with open("e2e-coverage.xml", "w", encoding="utf-8") as f: + f.write(coverage) diff --git a/cicd/multigpu.py b/cicd/multigpu.py index a2dd8d0b3..f028d0f68 100644 --- a/cicd/multigpu.py +++ b/cicd/multigpu.py @@ -77,7 +77,18 @@ def run_cmd(cmd: str, run_folder: str): def cicd_pytest(): run_cmd("./cicd/multigpu.sh", "/workspace/axolotl") + # Read the coverage file if it exists + coverage_file = pathlib.Path("/workspace/axolotl/multigpu-coverage.xml") + if coverage_file.exists(): + return coverage_file.read_text(encoding="utf-8") + return None + @app.local_entrypoint() def main(): - cicd_pytest.remote() + coverage = cicd_pytest.remote() + + # Save the coverage file to the local filesystem if it was generated + if coverage: + with open("multigpu-coverage.xml", "w", encoding="utf-8") as file: + file.write(coverage) diff --git a/docs/config-reference.qmd b/docs/config-reference.qmd new file mode 100644 index 000000000..d5fc65b0d --- /dev/null +++ b/docs/config-reference.qmd @@ -0,0 +1,780 @@ +--- +title: Config Reference +description: A complete list of all configuration options. +--- + +```yaml +# Allow overwrite yml config using from cli +strict: bool | None = False +# Resume from a specific checkpoint dir +resume_from_checkpoint: str | None +# If resume_from_checkpoint isn't set and you simply want it to start where it left off. +# Be careful with this being turned on between different models. +auto_resume_from_checkpoints: bool | None +# Resize the model embeddings when new tokens are added to multiples of 32. This is +# reported to improve training speed on some models +resize_token_embeddings_to_32x: bool | None +mean_resizing_embeddings: bool | None = False + +# Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink. +shrink_embeddings: bool | None +# Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs +embeddings_skip_upcast: bool | None + +# Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo' +rl: RLType | None + +trl: + # Beta parameter for the RL training. Same as `rl_beta`. Use + beta: float | None + # Maximum length of the completion for RL training. + max_completion_length: int | None + + # Whether to use VLLM for RL training. + use_vllm: bool = False + # Host of the vLLM server to connect to. + vllm_server_host: str | None = 0.0.0.0 + # Port of the vLLM server to connect to. + vllm_server_port: int | None = 8000 + # Total timeout (in seconds) to wait for the vLLM server to respond. + vllm_server_timeout: int | None + # Regex for vLLM guided decoding. + vllm_guided_decoding_regex: str | None + + # List of reward functions to load. Paths must be importable from current dir. + reward_funcs: list[str] | None + # List of reward weights for the reward functions. + reward_weights: list[float] | None + # Number of generations to sample. + num_generations: int | None + # Whether to log completions. + log_completions: bool | None = False + # Number of completions to print when log_completions is True. + num_completions_to_print: int | None + # Whether to sync the reference model. + sync_ref_model: bool | None = False + # Mixup alpha for the reference model. + ref_model_mixup_alpha: float | None = 0.9 + # Sync steps for the reference model. + ref_model_sync_steps: int | None = 64 + # Whether to scale rewards by their standard deviation. + scale_rewards: bool = True + + # Sampling temperature for the GRPO policy. + temperature: float | None + # Top-p sampling probability for the generation policy. + top_p: float | None + # Top-k sampling for the generation policy. + top_k: int | None + # Minimum probability for the generation policy. + min_p: float | None + # Penalty for tokens that appear in prompt and generated text. + repetition_penalty: float | None + # Number of iterations per batch (μ) for GRPO. + num_iterations: int | None + # Epsilon value for clipping in the GRPO algorithm. + epsilon: float | None + # Upper-bound epsilon value for clipping in the GRPO algorithm. + epsilon_high: float | None + # Whether to use Liger loss for GRPO. + use_liger_loss: bool | None + # Loss formulation to use. Supported values: grpo, bnpo, dr_grpo. + loss_type: str | None + # Whether to exclude truncated completions from loss calculation. + mask_truncated_completions: bool = False + +vllm: + # Device to use for VLLM + device: str | None = auto + # Tensor parallel size for VLLM + tensor_parallel_size: int | None + # GPU memory utilization for VLLM + gpu_memory_utilization: float | None = 0.9 + # Data type for VLLM + dtype: str | None = auto + # Maximum length of the model context for VLLM + max_model_len: int | None + # Enable prefix caching for VLLM + enable_prefix_caching: bool | None + # Host for the vLLM server to start on + host: str | None = 0.0.0.0 + # Port of the vLLM server to start on + port: int | None = 8000 + + # Enable reasoning for VLLM + enable_reasoning: bool | None + # Reasoning parser for VLLM + reasoning_parser: str | None + +qat: + # Fake quantization layout to use for activation quantization. Valid options are + # "int4" and "int8" + activation_dtype: TorchIntDType | None + # Fake quantization layout to use for weight quantization. Valid options are "int4" + # and "int8" + weight_dtype: TorchIntDType = TorchIntDType.int8 + # Quantize embedding + quantize_embedding: bool | None = False + # The number of elements in each group for per-group fake quantization + group_size: int | None = 32 + # The number of steps to apply fake quantization after + fake_quant_after_n_steps: int | None + +quantization: + # Fake quantization layout to use for weight quantization. Valid options are uintX for + # X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8 + weight_dtype: TorchIntDType = TorchIntDType.int8 + # Fake quantization layout to use for activation quantization. Valid options are + # "int4" and "int8" + activation_dtype: TorchIntDType | None + # Whether to quantize the embedding layer. + quantize_embedding: bool | None + # The number of elements in each group for per-group fake quantization + group_size: int | None = 32 + +# Reward modelling: `True` or `False` +reward_model: bool | None +# Process reward modelling: `True` or `False` +process_reward_model: bool | None +num_labels: int | None + +# Whether to perform weighting in DPO trainer +dpo_use_weighting: bool | None +dpo_use_logits_to_keep: bool | None +dpo_label_smoothing: float | None +dpo_norm_loss: bool | None +dpo_padding_free: bool | None + +# A list of one or more datasets to finetune the model with +datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None + +# A list of one or more datasets to eval the model with. You can use either +# test_datasets, or val_set_size, but not both. +test_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None +# If false, the datasets will not be shuffled and will keep their original order in +# `datasets`. The same applies to the `test_datasets` option and the +# `pretraining_dataset` option. Default is true. +shuffle_merged_datasets: bool | None = True +# Axolotl attempts to save the dataset as an arrow after packing the data together so +# subsequent training attempts load faster, relative path +dataset_prepared_path: str | None +# Num shards for whole dataset +dataset_shard_num: int | None +# Index of shard to use for whole dataset +dataset_shard_idx: int | None +skip_prepare_dataset: bool | None = False + +# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize +pretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None +# The maximum number of processes to use while preprocessing your input dataset. This +# defaults to `os.cpu_count()` if not set. +dataset_processes: int | None = 14 +# Deduplicates datasets and test_datasets with identical entries +dataset_exact_deduplication: bool | None +# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking +# too much storage +dataset_keep_in_memory: bool | None +dataloader_pin_memory: bool | None +dataloader_num_workers: int | None +dataloader_prefetch_factor: int | None +dataloader_drop_last: bool | None + +accelerator_config: dict[str, Any] | None + +remove_unused_columns: bool | None + +# Push prepared dataset to hub - repo_org/repo_name +push_dataset_to_hub: str | None +# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private +# datasets. Required to be true when used in combination with `push_dataset_to_hub` +hf_use_auth_token: bool | None + +device: Any | None +# Passed through to transformers when loading the model when launched without +# accelerate. Use `sequential` when training w/ model parallelism to limit memory +device_map: Any | None +world_size: int | None +# Don't mess with this, it's here for accelerate and torchrun +local_rank: int | None +ddp: bool | None + +# Seed for reproducibility +seed: int | None +# Advanced DDP Arguments - timeout +ddp_timeout: int | None +# Advanced DDP Arguments - bucket cap in MB +ddp_bucket_cap_mb: int | None +# Advanced DDP Arguments - broadcast buffers +ddp_broadcast_buffers: bool | None +ddp_find_unused_parameters: bool | None + +# Approximate number of predictions sent to wandb depending on batch size. Enabled above +# 0. Default is 0 +eval_table_size: int | None +# Total number of tokens generated for predictions sent to wandb. Default is 128 +eval_max_new_tokens: int | None +# Whether to run causal language model evaluation for metrics in +# `eval_causal_lm_metrics` +do_causal_lm_eval: bool | None +# HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter', +# 'chrf', 'perplexity'] +eval_causal_lm_metrics: list[str] | None +do_bench_eval: bool | None +bench_dataset: str | None +bench_split: str | None +metric_for_best_model: str | None +greater_is_better: bool | None + +# High loss value, indicating the learning has broken down (a good estimate is ~2 times +# the loss at the start of training) +loss_watchdog_threshold: float | None +# Number of high-loss steps in a row before the trainer aborts (default: 3) +loss_watchdog_patience: int | None + +gc_steps: int | None + +# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. +# require >=ampere +bf16: Literal['auto'] | bool | None = auto +# Use CUDA fp16 +fp16: bool | None +fp8: bool | None +# No AMP (automatic mixed precision) - require >=ampere +bfloat16: bool | None +# No AMP (automatic mixed precision) +float16: bool | None +# Use CUDA tf32 - require >=ampere +tf32: bool | None +float32: bool | None + +# Whether to use gradient checkpointing. Available options are: true, false, 'offload', +# 'offload_disk'. +# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing +gradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False +# Additional kwargs to pass to the trainer for gradient checkpointing +gradient_checkpointing_kwargs: dict[str, Any] | None + +unfrozen_parameters: list[str] | None + +# The maximum length of an input to train with, this should typically be less than 2048 +# as most models have a token/context limit of 2048 +sequence_len: int = 512 +min_sample_len: int | None +# maximum prompt length for RL training +max_prompt_len: int = 512 +# Use efficient multi-packing with block diagonal attention and per sequence +# position_ids. Recommend set to 'true' +sample_packing: bool | None +# The number of samples packed at a time. Increasing the following values helps with +# packing, but usually only slightly (<%1.) +sample_packing_group_size: int | None = 100000 +# The number of samples which can be packed into one sequence. Increase if using a large +# sequence_len with many short samples. +sample_packing_bin_size: int | None = 200 +# Whether to pack samples sequentially +sample_packing_sequentially: bool | None +# Set to 'false' if getting errors during eval with sample_packing on +eval_sample_packing: bool | None +# Pad inputs so each step uses constant sized buffers. This will reduce memory +# fragmentation and may prevent OOMs, by re-using memory more efficiently +pad_to_sequence_len: bool | None +# Whether to use sequential sampling for curriculum learning +curriculum_sampling: bool | None +multipack_real_batches: bool | None +# whether to concatenate samples during pretraining +pretraining_sample_concatenation: bool | None + +# Use batch flattening for speedups when not using sample_packing +batch_flattening: Literal['auto'] | bool | None + +use_pose: bool | None +pose_split_on_token_ids: list[int] | None +pose_max_context_len: int | None +pose_num_chunks: int | None + +pretrain_multipack_buffer_size: int | None = 10000 +# whether to prevent cross attention for packed sequences during pretraining +pretrain_multipack_attn: bool | None = True + +# Whether to use xformers attention patch https://github.com/facebookresearch/xformers +xformers_attention: bool | None +# Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/ +# torch.nn.functional.scaled_dot_product_attention.html +sdp_attention: bool | None +# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf +s2_attention: bool | None +flex_attention: bool | None +flex_attn_compile_kwargs: dict[str, Any] | None +# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention +flash_attention: bool | None +# Whether to use flash-attention cross entropy implementation - advanced use only +flash_attn_cross_entropy: bool | None +# Whether to use flash-attention rms norm implementation - advanced use only +flash_attn_rms_norm: bool | None +# Whether to fuse QKV into a single operation +flash_attn_fuse_qkv: bool | None +# Whether to fuse part of the MLP into a single operation +flash_attn_fuse_mlp: bool | None +# Whether to use bettertransformers +flash_optimum: bool | None + +eager_attention: bool | None + +unsloth_cross_entropy_loss: bool | None +unsloth_lora_mlp: bool | None +unsloth_lora_qkv: bool | None +unsloth_lora_o: bool | None +unsloth_rms_norm: bool | None +unsloth_rope: bool | None + +# Apply custom LoRA autograd functions and activation function Triton kernels for speed +# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html +lora_mlp_kernel: bool | None +# Apply custom LoRA autograd functions and activation function Triton kernels for speed +# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html +lora_qkv_kernel: bool | None +# Apply custom LoRA autograd functions and activation function Triton kernels for speed +# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html +lora_o_kernel: bool | None + +llama4_linearized_experts: bool | None + +# Deepspeed config path. e.g., deepspeed_configs/zero3.json +deepspeed: str | dict[str, Any] | None +# FSDP configuration +fsdp: list[str] | None +# FSDP configuration options +fsdp_config: dict[str, Any] | None +fsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None + +# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for +# no eval. +val_set_size: float | None = 0.0 + +# Set to a divisor of the number of GPUs available to split sequences into chunks of +# equal size. Use in long context training to prevent OOM when sequences cannot fit into +# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each +# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized +# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more +# details. +sequence_parallel_degree: int | None +# Optional; strides across the key dimension. Larger values use more memory but should +# make training faster. Must evenly divide the number of KV heads in your model. +heads_k_stride: int | None +# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to +# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing +# case. +ring_attn_func: RingAttnFunc | None + +# Add or change special tokens. If you add tokens here, you don't need to add them to +# the `tokens` list. +special_tokens: + bos_token: str | None + eos_token: str | None + pad_token: str | None + unk_token: str | None + additional_special_tokens: list[str] | None + +# Add extra tokens to the tokenizer +tokens: list[str] | None +# Mapping token_id to new_token_string to override reserved added_tokens in the +# tokenizer. Only works for tokens that are not part of the base vocab (aka are +# added_tokens). Can be checked if they exist in tokenizer.json added_tokens. +added_tokens_overrides: dict[int, str] | None + +# Whether to use torch.compile and which backend to use. setting to `auto` will enable +# torch compile when torch>=2.5.1 +torch_compile: Literal['auto'] | bool | None +# Backend to use for torch.compile +torch_compile_backend: str | None +torch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None + +# Maximum number of iterations to train for. It precedes num_epochs which means that if +# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps => +# `num_epochs: 2` and `max_steps: 100` will train for 100 steps +max_steps: int | None +# Number of warmup steps. Cannot use with warmup_ratio +warmup_steps: int | None +# Warmup ratio. Cannot use with warmup_steps +warmup_ratio: float | None +# Leave empty to eval at each epoch, integer for every N steps. float for fraction of +# total steps +eval_steps: int | float | None +# Number of times per epoch to run evals, mutually exclusive with eval_steps +evals_per_epoch: int | None +# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer +# from `eval_steps` +eval_strategy: str | None +# Leave empty to save at each epoch, integer for every N steps. float for fraction of +# total steps +save_steps: int | float | None +# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps +saves_per_epoch: int | None +# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better +# result is achieved, leave empty to infer from `save_steps` +save_strategy: str | None +# Checkpoints saved at a time +save_total_limit: int | None +# Logging frequency +logging_steps: int | None +# Stop training after this many evaluation losses have increased in a row. https://huggi +# ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin +# gCallback +early_stopping_patience: int | None +load_best_model_at_end: bool | None = False +# Save only the model weights, skipping the optimizer. Using this means you can't resume +# from checkpoints. +save_only_model: bool | None = False +# Use tensorboard for logging +use_tensorboard: bool | None +# Enable the pytorch profiler to capture the first N steps of training to the +# output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more +# information. Snapshots can be visualized @ https://pytorch.org/memory_viz +profiler_steps: int | None +# bool of whether to include tokens trainer per second in the training metrics. This +# iterates over the entire dataset once, so it takes some time. +include_tokens_per_second: bool | None + +# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to +# add noise to embeddings. Currently only supported on Llama and Mistral +neftune_noise_alpha: float | None + +# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to +# `beta` in `ORPOConfig` due to trl mapping. +orpo_alpha: float | None +# Weighting of NLL term in loss from RPO paper +rpo_alpha: float | None +# Target reward margin for the SimPO loss +simpo_gamma: float | None +# Weight of the BC regularizer +cpo_alpha: float | None + +# Factor for desirable loss term in KTO loss +kto_desirable_weight: float | None +# Factor for undesirable loss term in KTO loss +kto_undesirable_weight: float | None +# The beta parameter for the RL training +rl_beta: float | None + +# Defines the max memory usage per gpu on the system. Passed through to transformers +# when loading the model. +max_memory: dict[int | Literal['cpu', 'disk'], int | str] | None +# Limit the memory for all available GPUs to this amount (if an integer, expressed in +# gigabytes); default: unset +gpu_memory_limit: int | str | None +# Whether to use low_cpu_mem_usage +low_cpu_mem_usage: bool | None + +# The name of the chat template to use for training, following values are supported: +# tokenizer_default: Uses the chat template that is available in the +# tokenizer_config.json. If the chat template is not available in the tokenizer, it will +# raise an error. This is the default value. +# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates +# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. +# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. +# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not +# available in the tokenizer. jinja: Uses a custom jinja template for the chat template. +# The custom jinja template should be provided in the chat_template_jinja field. The +# selected chat template will be saved to the tokenizer_config.json for easier +# inferencing +chat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None +# Custom jinja template for chat template. This will be only used if chat_template is +# set to `jinja` or `null` (in which case chat_template is automatically set to +# `jinja`). Default is null. +chat_template_jinja: str | None +# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the +# boundaries between conversation turns. For example: ['/INST', '', +# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is +# useful for templates that use multiple delimiter tokens. +eot_tokens: list[str] | None +# Changes the default system message. Currently only supports chatml. +default_system_message: str | None + +fix_untrained_tokens: int | list[int] | None + +is_preprocess: bool | None +preprocess_iterable: bool | None + +# Total number of tokens - internal use +total_num_tokens: int | None +total_supervised_tokens: int | None +# You can set these packing optimizations AFTER starting a training at least once. The +# trainer will provide recommended values for these values. +sample_packing_eff_est: float | None +axolotl_config_path: str | None + +# Internal use only - Used to identify which the model is based on +is_falcon_derived_model: bool | None +# Internal use only - Used to identify which the model is based on +is_llama_derived_model: bool | None +# Internal use only - Used to identify which the model is based on. Please note that if +# you set this to true, `padding_side` will be set to 'left' by default +is_mistral_derived_model: bool | None +# Internal use only - Used to identify which the model is based on +is_qwen_derived_model: bool | None + +# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available +# plugins or doc below for more details. +# https://docs.axolotl.ai/docs/custom_integrations.html +plugins: list[str] | None + +# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This +# can also be a relative path to a model on disk +base_model: str (required) +# If the base_model repo on hf hub doesn't include configuration .json files, You can +# set that here, or leave this empty to default to base_model +base_model_config: str | None +cls_model_config: str | None +# Optional tokenizer configuration path in case you want to use a different tokenizer +# than the one defined in the base model +tokenizer_config: str | None +# use_fast option for tokenizer loading from_pretrained, default to True +tokenizer_use_fast: bool | None +# Whether to use the legacy tokenizer setting, defaults to True +tokenizer_legacy: bool | None +# Whether to use mistral-common tokenizer. If set to True, it will use the mistral- +# common tokenizer. +tokenizer_use_mistral_common: bool | None +# Corresponding tokenizer for the model AutoTokenizer is a good choice +tokenizer_type: str | None +# transformers processor class +processor_type: str | None +# Trust remote code for untrusted source +trust_remote_code: bool | None + +# Where to save the full-finetuned model to +output_dir: str = ./model-out +# push checkpoints to hub +hub_model_id: str | None +# how to push checkpoints to hub +hub_strategy: str | None +# Save model as safetensors (require safetensors package). Default True +save_safetensors: bool | None = True + +# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer +load_in_8bit: bool | None = False +# Use bitsandbytes 4 bit +load_in_4bit: bool | None = False + +# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in +# original model +adapter: str | None +# If you already have a lora model trained that you want to load, put that here. This +# means after training, if you want to test the model, you should set this to the value +# of `output_dir`. Note that if you merge an adapter to the base model, a new +# subdirectory `merged` will be created under the `output_dir`. +lora_model_dir: str | None +lora_r: int | None +lora_alpha: int | None +lora_fan_in_fan_out: bool | None +lora_target_modules: str | list[str] | None +# If true, will target all linear modules +lora_target_linear: bool | None +# If you added new tokens to the tokenizer, you may need to save some LoRA modules +# because they need to know the new tokens. For LLaMA and Mistral, you need to save +# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts +# tokens to embeddings, and `lm_head` converts embeddings to token probabilities. +lora_modules_to_save: list[str] | None +lora_dropout: float | None = 0.0 +# The layer indices to transform, otherwise, apply to all layers +peft_layers_to_transform: list[int] | None +peft_layers_pattern: list[str] | None + +peft: + # Configuration options for loftq initialization for LoRA + loftq_config: + # typically 4 bits + loftq_bits: int = 4 + +# Whether to use DoRA. +peft_use_dora: bool | None +# Whether to use RSLoRA. +peft_use_rslora: bool | None +# List of layer indices to replicate. +peft_layer_replication: list[tuple[int, int]] | None +# How to initialize LoRA weights. Default to True which is MS original implementation. +peft_init_lora_weights: bool | str | None + +# load qlora model in sharded format for FSDP using answer.ai technique. +qlora_sharded_model_loading: bool | None = False +# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it +# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge +lora_on_cpu: bool | None +# Whether you are training a 4-bit GPTQ quantized model +gptq: bool | None +# optional overrides to the bnb 4bit quantization configuration +bnb_config_kwargs: dict[str, Any] | None + +# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4. +loraplus_lr_ratio: float | None +# loraplus learning rate for lora embedding layers. Default value is 1e-6. +loraplus_lr_embedding: float | None = 1e-06 + +merge_lora: bool | None + +# Number of steps per ReLoRA restart +relora_steps: int | None +# Number of per-restart warmup steps +relora_warmup_steps: int | None +# Number of anneal steps for each relora cycle +relora_anneal_steps: int | None +# threshold for optimizer magnitude when pruning +relora_prune_ratio: float | None +# True to perform lora weight merges on cpu during restarts, for modest gpu memory +# savings +relora_cpu_offload: bool | None + +# If greater than 1, backpropagation will be skipped and the gradients will be +# accumulated for the given number of steps. +gradient_accumulation_steps: int | None = 1 +# The number of samples to include in each batch. This is the number of samples sent to +# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps +micro_batch_size: int | None = 1 +# Total batch size, we do not recommended setting this manually +batch_size: int | None +# per gpu micro batch size for evals, defaults to value of micro_batch_size +eval_batch_size: int | None + +# whether to find batch size that fits in memory. Passed to underlying transformers +# Trainer +auto_find_batch_size: bool | None + +# Whether to mask out or include the human's prompt from the training labels +train_on_inputs: bool | None = False +# Group similarly sized data to minimize padding. May be slower to start, as it must +# download and sort the entire dataset. Note that training loss may have an oscillating +# pattern with this enabled. +group_by_length: bool | None + +learning_rate: str | float (required) +embedding_lr: float | None +embedding_lr_scale: float | None +# Specify weight decay +weight_decay: float | None = 0.0 +# Specify optimizer +optimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED +# Dictionary of arguments to pass to the optimizer +optim_args: str | dict[str, Any] | None +# The target modules to optimize, i.e. the module names that you would like to train, +# right now this is used only for GaLore algorithm +optim_target_modules: list[str] | Literal['all_linear'] | None +# Path to torch distx for optim 'adamw_anyprecision' +torchdistx_path: str | None +lr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE +# Specify a scheduler and kwargs to use with the optimizer +lr_scheduler_kwargs: dict[str, Any] | None +lr_quadratic_warmup: bool | None +# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of +# peak lr +cosine_min_lr_ratio: float | None +# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means +# start cosine_min_lr at 80% of training step +cosine_constant_lr_ratio: float | None +# Learning rate div factor +lr_div_factor: float | None +lr_groups: list[LrGroup] | None + +# adamw hyperparams +adam_epsilon: float | None +# only used for CAME Optimizer +adam_epsilon2: float | None +# adamw hyperparams +adam_beta1: float | None +# adamw hyperparams +adam_beta2: float | None +# only used for CAME Optimizer +adam_beta3: float | None +# Gradient clipping max norm +max_grad_norm: float | None +num_epochs: float = 1.0 + +use_wandb: bool | None +# Set the name of your wandb run +wandb_name: str | None +# Set the ID of your wandb run +wandb_run_id: str | None +# "offline" to save run metadata locally and not sync to the server, "disabled" to turn +# off wandb +wandb_mode: str | None +# Your wandb project name +wandb_project: str | None +# A wandb Team name if using a Team +wandb_entity: str | None +wandb_watch: str | None +# "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only +# at the end of training +wandb_log_model: str | None + +use_mlflow: bool | None +# URI to mlflow +mlflow_tracking_uri: str | None +# Your experiment name +mlflow_experiment_name: str | None +# Your run name +mlflow_run_name: str | None +# set to true to copy each saved checkpoint on each save to mlflow artifact registry +hf_mlflow_log_artifacts: bool | None + +# Enable or disable Comet integration. +use_comet: bool | None +# API key for Comet. Recommended to set via `comet login`. +comet_api_key: str | None +# Workspace name in Comet. Defaults to the user's default workspace. +comet_workspace: str | None +# Project name in Comet. Defaults to Uncategorized. +comet_project_name: str | None +# Identifier for the experiment. Used to append data to an existing experiment or +# control the key of new experiments. Default to a random key. +comet_experiment_key: str | None +# Create a new experiment ("create") or log to an existing one ("get"). Default +# ("get_or_create") auto-selects based on configuration. +comet_mode: str | None +# Set to True to log data to Comet server, or False for offline storage. Default is +# True. +comet_online: bool | None +# Dictionary for additional configuration settings, see the doc for more details. +comet_experiment_config: dict[str, Any] | None + +# the number of activate layers in LISA +lisa_n_layers: int | None +# how often to switch layers in LISA +lisa_step_interval: int | None +# path under the model to access the layers +lisa_layers_attribute: str | None = model.layers + +gradio_title: str | None +gradio_share: bool | None +gradio_server_name: str | None +gradio_server_port: int | None +gradio_max_new_tokens: int | None +gradio_temperature: float | None + +use_ray: bool = False +ray_run_name: str | None +ray_num_workers: int = 1 +resources_per_worker: dict + +# The size of the image to resize to. It can be an integer (resized into padded-square +# image) or a tuple (width, height).If not provided, we will attempt to load from +# preprocessor.size, otherwise, images won't be resized. +image_size: int | tuple[int, int] | None +# The resampling algorithm to use for image resizing. Default is bilinear. Please refer +# to PIL.Image.Resampling for more details. +image_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None + +# optional overrides to the base model configuration +overrides_of_model_config: dict[str, Any] | None +# optional overrides the base model loading from_pretrained +overrides_of_model_kwargs: dict[str, Any] | None +# If you want to specify the type of model to load, AutoModelForCausalLM is a good +# choice too +type_of_model: str | None +# You can specify to choose a specific model revision from huggingface hub +revision_of_model: str | None + +max_packed_sequence_len: int | None +rope_scaling: Any | None +noisy_embedding_alpha: float | None +dpo_beta: float | None +evaluation_strategy: str | None +```