From 016eb8055f21df07da61cbdc8c6fbdacdf5907d1 Mon Sep 17 00:00:00 2001
From: Dan Saunders
Date: Tue, 17 Jun 2025 13:58:02 -0400
Subject: [PATCH] accidental file

---
 .github/workflows/tests.yml |   1 -
 docs/config-reference.qmd   | 780 ------------------------------------
 2 files changed, 781 deletions(-)
 delete mode 100644 docs/config-reference.qmd

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 73d9a3335..e519314b3 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -371,7 +371,6 @@ jobs:
           directory: coverage-reports
           fail_ci_if_error: false
           verbose: true
-          carryforward: true
           name: codecov-umbrella
           override_commit: ${{ github.event.pull_request.head.sha || github.sha }}
           override_pr: ${{ github.event.pull_request.number }}
diff --git a/docs/config-reference.qmd b/docs/config-reference.qmd
deleted file mode 100644
index d5fc65b0d..000000000
--- a/docs/config-reference.qmd
+++ /dev/null
@@ -1,780 +0,0 @@
----
-title: Config Reference
-description: A complete list of all configuration options.
----
-
-```yaml
-# Allow overwriting yml config options from the CLI
-strict: bool | None = False
-# Resume from a specific checkpoint dir
-resume_from_checkpoint: str | None
-# If resume_from_checkpoint isn't set and you simply want training to start where it left off.
-# Be careful with this being turned on between different models.
-auto_resume_from_checkpoints: bool | None
-# Resize the model embeddings when new tokens are added to multiples of 32. This is
-# reported to improve training speed on some models
-resize_token_embeddings_to_32x: bool | None
-mean_resizing_embeddings: bool | None = False
-
-# Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
-shrink_embeddings: bool | None
-# Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs
-embeddings_skip_upcast: bool | None
-
-# Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'
-rl: RLType | None
-
-trl:
-  # Beta parameter for the RL training. Same as `rl_beta`.
-  beta: float | None
-  # Maximum length of the completion for RL training.
-  max_completion_length: int | None
-
-  # Whether to use VLLM for RL training.
-  use_vllm: bool = False
-  # Host of the vLLM server to connect to.
-  vllm_server_host: str | None = 0.0.0.0
-  # Port of the vLLM server to connect to.
-  vllm_server_port: int | None = 8000
-  # Total timeout (in seconds) to wait for the vLLM server to respond.
-  vllm_server_timeout: int | None
-  # Regex for vLLM guided decoding.
-  vllm_guided_decoding_regex: str | None
-
-  # List of reward functions to load. Paths must be importable from current dir.
-  reward_funcs: list[str] | None
-  # List of reward weights for the reward functions.
-  reward_weights: list[float] | None
-  # Number of generations to sample.
-  num_generations: int | None
-  # Whether to log completions.
-  log_completions: bool | None = False
-  # Number of completions to print when log_completions is True.
-  num_completions_to_print: int | None
-  # Whether to sync the reference model.
-  sync_ref_model: bool | None = False
-  # Mixup alpha for the reference model.
-  ref_model_mixup_alpha: float | None = 0.9
-  # Sync steps for the reference model.
-  ref_model_sync_steps: int | None = 64
-  # Whether to scale rewards by their standard deviation.
-  scale_rewards: bool = True
-
-  # Sampling temperature for the GRPO policy.
-  temperature: float | None
-  # Top-p sampling probability for the generation policy.
-  top_p: float | None
-  # Top-k sampling for the generation policy.
-  top_k: int | None
-  # Minimum probability for the generation policy.
-  min_p: float | None
-  # Penalty for tokens that appear in prompt and generated text.
-  repetition_penalty: float | None
-  # Number of iterations per batch (μ) for GRPO.
-  num_iterations: int | None
-  # Epsilon value for clipping in the GRPO algorithm.
-  epsilon: float | None
-  # Upper-bound epsilon value for clipping in the GRPO algorithm.
-  epsilon_high: float | None
-  # Whether to use Liger loss for GRPO.
-  use_liger_loss: bool | None
-  # Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.
-  loss_type: str | None
-  # Whether to exclude truncated completions from loss calculation.
-  mask_truncated_completions: bool = False
-
-vllm:
-  # Device to use for VLLM
-  device: str | None = auto
-  # Tensor parallel size for VLLM
-  tensor_parallel_size: int | None
-  # GPU memory utilization for VLLM
-  gpu_memory_utilization: float | None = 0.9
-  # Data type for VLLM
-  dtype: str | None = auto
-  # Maximum length of the model context for VLLM
-  max_model_len: int | None
-  # Enable prefix caching for VLLM
-  enable_prefix_caching: bool | None
-  # Host for the vLLM server to start on
-  host: str | None = 0.0.0.0
-  # Port of the vLLM server to start on
-  port: int | None = 8000
-
-  # Enable reasoning for VLLM
-  enable_reasoning: bool | None
-  # Reasoning parser for VLLM
-  reasoning_parser: str | None
-
-qat:
-  # Fake quantization layout to use for activation quantization. Valid options are
-  # "int4" and "int8"
-  activation_dtype: TorchIntDType | None
-  # Fake quantization layout to use for weight quantization. Valid options are "int4"
-  # and "int8"
-  weight_dtype: TorchIntDType = TorchIntDType.int8
-  # Quantize embedding
-  quantize_embedding: bool | None = False
-  # The number of elements in each group for per-group fake quantization
-  group_size: int | None = 32
-  # The number of steps to apply fake quantization after
-  fake_quant_after_n_steps: int | None
-
-quantization:
-  # Fake quantization layout to use for weight quantization. Valid options are uintX for
-  # X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
-  weight_dtype: TorchIntDType = TorchIntDType.int8
-  # Fake quantization layout to use for activation quantization. Valid options are
-  # "int4" and "int8"
-  activation_dtype: TorchIntDType | None
-  # Whether to quantize the embedding layer.
-  quantize_embedding: bool | None
-  # The number of elements in each group for per-group fake quantization
-  group_size: int | None = 32
-
-# Reward modelling: `True` or `False`
-reward_model: bool | None
-# Process reward modelling: `True` or `False`
-process_reward_model: bool | None
-num_labels: int | None
-
-# Whether to perform weighting in DPO trainer
-dpo_use_weighting: bool | None
-dpo_use_logits_to_keep: bool | None
-dpo_label_smoothing: float | None
-dpo_norm_loss: bool | None
-dpo_padding_free: bool | None
-
-# A list of one or more datasets to finetune the model with
-datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None
-
-# A list of one or more datasets to eval the model with. You can use either
-# test_datasets, or val_set_size, but not both.
-test_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None
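# Editor's note - illustrative sketch, not part of the original file: a minimal
# `datasets` entry. The nested `path`/`type` fields and the values shown are
# assumptions for illustration only.
#
#   datasets:
#     - path: tatsu-lab/alpaca   # hypothetical HF dataset repo
#       type: alpaca
#   dataset_prepared_path: ./last_run_prepared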
-# If false, the datasets will not be shuffled and will keep their original order in
-# `datasets`. The same applies to the `test_datasets` option and the
-# `pretraining_dataset` option. Default is true.
-shuffle_merged_datasets: bool | None = True
-# Axolotl attempts to save the dataset as an Arrow dataset after packing the data together so
-# subsequent training attempts load faster, relative path
-dataset_prepared_path: str | None
-# Num shards for whole dataset
-dataset_shard_num: int | None
-# Index of shard to use for whole dataset
-dataset_shard_idx: int | None
-skip_prepare_dataset: bool | None = False
-
-# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
-pretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None
-# The maximum number of processes to use while preprocessing your input dataset. This
-# defaults to `os.cpu_count()` if not set.
-dataset_processes: int | None = 14
-# Deduplicates datasets and test_datasets with identical entries
-dataset_exact_deduplication: bool | None
-# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking
-# too much storage
-dataset_keep_in_memory: bool | None
-dataloader_pin_memory: bool | None
-dataloader_num_workers: int | None
-dataloader_prefetch_factor: int | None
-dataloader_drop_last: bool | None
-
-accelerator_config: dict[str, Any] | None
-
-remove_unused_columns: bool | None
-
-# Push prepared dataset to hub - repo_org/repo_name
-push_dataset_to_hub: str | None
-# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private
-# datasets. Required to be true when used in combination with `push_dataset_to_hub`
-hf_use_auth_token: bool | None
-
-device: Any | None
-# Passed through to transformers when loading the model when launched without
-# accelerate. Use `sequential` when training w/ model parallelism to limit memory
-device_map: Any | None
-world_size: int | None
-# Don't mess with this, it's here for accelerate and torchrun
-local_rank: int | None
-ddp: bool | None
-
-# Seed for reproducibility
-seed: int | None
-# Advanced DDP Arguments - timeout
-ddp_timeout: int | None
-# Advanced DDP Arguments - bucket cap in MB
-ddp_bucket_cap_mb: int | None
-# Advanced DDP Arguments - broadcast buffers
-ddp_broadcast_buffers: bool | None
-ddp_find_unused_parameters: bool | None
-
-# Approximate number of predictions sent to wandb depending on batch size. Enabled above
-# 0. Default is 0
-eval_table_size: int | None
-# Total number of tokens generated for predictions sent to wandb. Default is 128
-eval_max_new_tokens: int | None
-# Whether to run causal language model evaluation for metrics in
-# `eval_causal_lm_metrics`
-do_causal_lm_eval: bool | None
-# HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter',
-# 'chrf', 'perplexity']
-eval_causal_lm_metrics: list[str] | None
-do_bench_eval: bool | None
-bench_dataset: str | None
-bench_split: str | None
-metric_for_best_model: str | None
-greater_is_better: bool | None
-
-# High loss value, indicating the learning has broken down (a good estimate is ~2 times
-# the loss at the start of training)
-loss_watchdog_threshold: float | None
-# Number of high-loss steps in a row before the trainer aborts (default: 3)
-loss_watchdog_patience: int | None
-
-gc_steps: int | None
-
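# Editor's note - hedged example (values are illustrative, not defaults): enabling
# the causal-LM eval metrics and wandb prediction table described above.
#
#   do_causal_lm_eval: true
#   eval_causal_lm_metrics: ['sacrebleu', 'perplexity']
#   eval_table_size: 5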
-# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.
-# Requires >= Ampere
-bf16: Literal['auto'] | bool | None = auto
-# Use CUDA fp16
-fp16: bool | None
-fp8: bool | None
-# No AMP (automatic mixed precision) - requires >= Ampere
-bfloat16: bool | None
-# No AMP (automatic mixed precision)
-float16: bool | None
-# Use CUDA tf32 - requires >= Ampere
-tf32: bool | None
-float32: bool | None
-
-# Whether to use gradient checkpointing. Available options are: true, false, 'offload',
-# 'offload_disk'.
-# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
-gradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False
-# Additional kwargs to pass to the trainer for gradient checkpointing
-gradient_checkpointing_kwargs: dict[str, Any] | None
-
-unfrozen_parameters: list[str] | None
-
-# The maximum length of an input to train with, this should typically be less than 2048
-# as most models have a token/context limit of 2048
-sequence_len: int = 512
-min_sample_len: int | None
-# Maximum prompt length for RL training
-max_prompt_len: int = 512
-# Use efficient multi-packing with block diagonal attention and per sequence
-# position_ids. Recommended to set to 'true'
-sample_packing: bool | None
-# The number of samples packed at a time. Increasing the following values helps with
-# packing, but usually only slightly (<1%).
-sample_packing_group_size: int | None = 100000
-# The number of samples which can be packed into one sequence. Increase if using a large
-# sequence_len with many short samples.
-sample_packing_bin_size: int | None = 200
-# Whether to pack samples sequentially
-sample_packing_sequentially: bool | None
-# Set to 'false' if getting errors during eval with sample_packing on
-eval_sample_packing: bool | None
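# Editor's note - hedged example of the packing options above; the values are
# illustrative assumptions, not recommendations from the original file.
#
#   sequence_len: 4096
#   sample_packing: true
#   eval_sample_packing: false
#   pad_to_sequence_len: true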
-# Pad inputs so each step uses constant sized buffers. This will reduce memory
-# fragmentation and may prevent OOMs, by re-using memory more efficiently
-pad_to_sequence_len: bool | None
-# Whether to use sequential sampling for curriculum learning
-curriculum_sampling: bool | None
-multipack_real_batches: bool | None
-# whether to concatenate samples during pretraining
-pretraining_sample_concatenation: bool | None
-
-# Use batch flattening for speedups when not using sample_packing
-batch_flattening: Literal['auto'] | bool | None
-
-use_pose: bool | None
-pose_split_on_token_ids: list[int] | None
-pose_max_context_len: int | None
-pose_num_chunks: int | None
-
-pretrain_multipack_buffer_size: int | None = 10000
-# whether to prevent cross attention for packed sequences during pretraining
-pretrain_multipack_attn: bool | None = True
-
-# Whether to use xformers attention patch https://github.com/facebookresearch/xformers
-xformers_attention: bool | None
-# Whether to use scaled-dot-product attention
-# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
-sdp_attention: bool | None
-# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
-s2_attention: bool | None
-flex_attention: bool | None
-flex_attn_compile_kwargs: dict[str, Any] | None
-# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention
-flash_attention: bool | None
-# Whether to use flash-attention cross entropy implementation - advanced use only
-flash_attn_cross_entropy: bool | None
-# Whether to use flash-attention rms norm implementation - advanced use only
-flash_attn_rms_norm: bool | None
-# Whether to fuse QKV into a single operation
-flash_attn_fuse_qkv: bool | None
-# Whether to fuse part of the MLP into a single operation
-flash_attn_fuse_mlp: bool | None
-# Whether to use bettertransformers
-flash_optimum: bool | None
-
-eager_attention: bool | None
-
-unsloth_cross_entropy_loss: bool | None
-unsloth_lora_mlp: bool | None
-unsloth_lora_qkv: bool | None
-unsloth_lora_o: bool | None
-unsloth_rms_norm: bool | None
-unsloth_rope: bool | None
-
-# Apply custom LoRA autograd functions and activation function Triton kernels for speed
-# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
-lora_mlp_kernel: bool | None
-# Apply custom LoRA autograd functions and activation function Triton kernels for speed
-# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
-lora_qkv_kernel: bool | None
-# Apply custom LoRA autograd functions and activation function Triton kernels for speed
-# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
-lora_o_kernel: bool | None
-
-llama4_linearized_experts: bool | None
-
-# Deepspeed config path. e.g., deepspeed_configs/zero3.json
-deepspeed: str | dict[str, Any] | None
-# FSDP configuration
-fsdp: list[str] | None
-# FSDP configuration options
-fsdp_config: dict[str, Any] | None
-fsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
-
-# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for
-# no eval.
-val_set_size: float | None = 0.0
-
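# Editor's note - hedged example: pointing `deepspeed` at one of the bundled JSON
# configs mentioned above and holding out 5% of the data for eval. Values are
# illustrative only.
#
#   deepspeed: deepspeed_configs/zero3.json
#   val_set_size: 0.05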
-# Set to a divisor of the number of GPUs available to split sequences into chunks of
-# equal size. Use in long context training to prevent OOM when sequences cannot fit into
-# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each
-# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized
-# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more
-# details.
-sequence_parallel_degree: int | None
-# Optional; strides across the key dimension. Larger values use more memory but should
-# make training faster. Must evenly divide the number of KV heads in your model.
-heads_k_stride: int | None
-# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to
-# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing
-# case.
-ring_attn_func: RingAttnFunc | None
-
-# Add or change special tokens. If you add tokens here, you don't need to add them to
-# the `tokens` list.
-special_tokens:
-  bos_token: str | None
-  eos_token: str | None
-  pad_token: str | None
-  unk_token: str | None
-  additional_special_tokens: list[str] | None
-
-# Add extra tokens to the tokenizer
-tokens: list[str] | None
-# Mapping token_id to new_token_string to override reserved added_tokens in the
-# tokenizer. Only works for tokens that are not part of the base vocab (aka are
-# added_tokens). Can be checked if they exist in tokenizer.json added_tokens.
-added_tokens_overrides: dict[int, str] | None
-
-# Whether to use torch.compile and which backend to use. Setting to `auto` will enable
-# torch compile when torch>=2.5.1
-torch_compile: Literal['auto'] | bool | None
-# Backend to use for torch.compile
-torch_compile_backend: str | None
-torch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None
-
-# Maximum number of iterations to train for. It precedes num_epochs which means that if
-# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =>
-# `num_epochs: 2` and `max_steps: 100` will train for 100 steps
-max_steps: int | None
-# Number of warmup steps. Cannot use with warmup_ratio
-warmup_steps: int | None
-# Warmup ratio. Cannot use with warmup_steps
-warmup_ratio: float | None
-# Leave empty to eval at each epoch, integer for every N steps. float for fraction of
-# total steps
-eval_steps: int | float | None
-# Number of times per epoch to run evals, mutually exclusive with eval_steps
-evals_per_epoch: int | None
-# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer
-# from `eval_steps`
-eval_strategy: str | None
-# Leave empty to save at each epoch, integer for every N steps. float for fraction of
-# total steps
-save_steps: int | float | None
-# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps
-saves_per_epoch: int | None
-# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better
-# result is achieved, leave empty to infer from `save_steps`
-save_strategy: str | None
-# Maximum number of checkpoints saved at a time
-save_total_limit: int | None
-# Logging frequency
-logging_steps: int | None
-# Stop training after this many evaluation losses have increased in a row.
-# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
-early_stopping_patience: int | None
-load_best_model_at_end: bool | None = False
-# Save only the model weights, skipping the optimizer. Using this means you can't resume
-# from checkpoints.
-save_only_model: bool | None = False
-# Use tensorboard for logging
-use_tensorboard: bool | None
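# Editor's note - hedged example of the checkpoint and eval cadence options above;
# the numbers are illustrative assumptions.
#
#   warmup_ratio: 0.03
#   evals_per_epoch: 2
#   saves_per_epoch: 1
#   save_total_limit: 3
#   logging_steps: 10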
-# Enable the pytorch profiler to capture the first N steps of training to the
-# output_dir. See https://pytorch.org/blog/understanding-gpu-memory-1/ for more
-# information. Snapshots can be visualized @ https://pytorch.org/memory_viz
-profiler_steps: int | None
-# Whether to include tokens-per-second throughput in the training metrics. This
-# iterates over the entire dataset once, so it takes some time.
-include_tokens_per_second: bool | None
-
-# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to
-# add noise to embeddings. Currently only supported on Llama and Mistral
-neftune_noise_alpha: float | None
-
-# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to
-# `beta` in `ORPOConfig` due to trl mapping.
-orpo_alpha: float | None
-# Weighting of NLL term in loss from RPO paper
-rpo_alpha: float | None
-# Target reward margin for the SimPO loss
-simpo_gamma: float | None
-# Weight of the BC regularizer
-cpo_alpha: float | None
-
-# Factor for desirable loss term in KTO loss
-kto_desirable_weight: float | None
-# Factor for undesirable loss term in KTO loss
-kto_undesirable_weight: float | None
-# The beta parameter for the RL training
-rl_beta: float | None
-
-# Defines the max memory usage per gpu on the system. Passed through to transformers
-# when loading the model.
-max_memory: dict[int | Literal['cpu', 'disk'], int | str] | None
-# Limit the memory for all available GPUs to this amount (if an integer, expressed in
-# gigabytes); default: unset
-gpu_memory_limit: int | str | None
-# Whether to use low_cpu_mem_usage
-low_cpu_mem_usage: bool | None
-
-# The name of the chat template to use for training, following values are supported:
-# tokenizer_default: Uses the chat template that is available in the
-# tokenizer_config.json. If the chat template is not available in the tokenizer, it will
-# raise an error. This is the default value.
-# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates
-# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.
-# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.
-# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not
-# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.
-# The custom jinja template should be provided in the chat_template_jinja field. The
-# selected chat template will be saved to the tokenizer_config.json for easier
-# inferencing
-chat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None
-# Custom jinja template for chat template. This will be only used if chat_template is
-# set to `jinja` or `null` (in which case chat_template is automatically set to
-# `jinja`). Default is null.
-chat_template_jinja: str | None
-# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the
-# boundaries between conversation turns. For example: ['/INST', '',
-# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is
-# useful for templates that use multiple delimiter tokens.
-eot_tokens: list[str] | None
-# Changes the default system message. Currently only supports chatml.
-default_system_message: str | None
-
-fix_untrained_tokens: int | list[int] | None
-
-is_preprocess: bool | None
-preprocess_iterable: bool | None
-
-# Total number of tokens - internal use
-total_num_tokens: int | None
-total_supervised_tokens: int | None
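# Editor's note - hedged example of the chat template options above. The template
# name comes from the list in the comment; the system message is an assumption.
#
#   chat_template: chatml
#   default_system_message: You are a helpful assistant.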
-# You can set these packing optimizations AFTER starting a training at least once. The
-# trainer will provide recommended values for these values.
-sample_packing_eff_est: float | None
-axolotl_config_path: str | None
-
-# Internal use only - Used to identify what the model is based on
-is_falcon_derived_model: bool | None
-# Internal use only - Used to identify what the model is based on
-is_llama_derived_model: bool | None
-# Internal use only - Used to identify what the model is based on. Please note that if
-# you set this to true, `padding_side` will be set to 'left' by default
-is_mistral_derived_model: bool | None
-# Internal use only - Used to identify what the model is based on
-is_qwen_derived_model: bool | None
-
-# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available
-# plugins or doc below for more details.
-# https://docs.axolotl.ai/docs/custom_integrations.html
-plugins: list[str] | None
-
-# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This
-# can also be a relative path to a model on disk
-base_model: str (required)
-# If the base_model repo on hf hub doesn't include configuration .json files, you can
-# set that here, or leave this empty to default to base_model
-base_model_config: str | None
-cls_model_config: str | None
-# Optional tokenizer configuration path in case you want to use a different tokenizer
-# than the one defined in the base model
-tokenizer_config: str | None
-# use_fast option for tokenizer loading from_pretrained, default to True
-tokenizer_use_fast: bool | None
-# Whether to use the legacy tokenizer setting, defaults to True
-tokenizer_legacy: bool | None
-# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-
-# common tokenizer.
-tokenizer_use_mistral_common: bool | None
-# Corresponding tokenizer for the model. AutoTokenizer is a good choice
-tokenizer_type: str | None
-# transformers processor class
-processor_type: str | None
-# Trust remote code for untrusted source
-trust_remote_code: bool | None
-
-# Where to save the full-finetuned model to
-output_dir: str = ./model-out
-# Push checkpoints to hub
-hub_model_id: str | None
-# How to push checkpoints to hub
-hub_strategy: str | None
-# Save model as safetensors (require safetensors package). Default True
-save_safetensors: bool | None = True
-
-# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
-load_in_8bit: bool | None = False
-# Use bitsandbytes 4 bit
-load_in_4bit: bool | None = False
-
-# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in
-# original model
-adapter: str | None
-# If you already have a lora model trained that you want to load, put that here. This
-# means after training, if you want to test the model, you should set this to the value
-# of `output_dir`. Note that if you merge an adapter to the base model, a new
-# subdirectory `merged` will be created under the `output_dir`.
-lora_model_dir: str | None
-lora_r: int | None
-lora_alpha: int | None
-lora_fan_in_fan_out: bool | None
-lora_target_modules: str | list[str] | None
-# If true, will target all linear modules
-lora_target_linear: bool | None
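# Editor's note - hedged example of a 4-bit 'qlora' adapter setup using the options
# above; the base model name and hyperparameter values are assumptions.
#
#   base_model: NousResearch/Llama-2-7b-hf
#   load_in_4bit: true
#   adapter: qlora
#   lora_r: 32
#   lora_alpha: 16
#   lora_target_linear: true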
-# If you added new tokens to the tokenizer, you may need to save some LoRA modules
-# because they need to know the new tokens. For LLaMA and Mistral, you need to save
-# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts
-# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
-lora_modules_to_save: list[str] | None
-lora_dropout: float | None = 0.0
-# The layer indices to transform, otherwise, apply to all layers
-peft_layers_to_transform: list[int] | None
-peft_layers_pattern: list[str] | None
-
-peft:
-  # Configuration options for loftq initialization for LoRA
-  loftq_config:
-    # typically 4 bits
-    loftq_bits: int = 4
-
-# Whether to use DoRA.
-peft_use_dora: bool | None
-# Whether to use RSLoRA.
-peft_use_rslora: bool | None
-# List of layer indices to replicate.
-peft_layer_replication: list[tuple[int, int]] | None
-# How to initialize LoRA weights. Default to True which is MS original implementation.
-peft_init_lora_weights: bool | str | None
-
-# Load qlora model in sharded format for FSDP using answer.ai technique.
-qlora_sharded_model_loading: bool | None = False
-# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it
-# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
-lora_on_cpu: bool | None
-# Whether you are training a 4-bit GPTQ quantized model
-gptq: bool | None
-# Optional overrides to the bnb 4bit quantization configuration
-bnb_config_kwargs: dict[str, Any] | None
-
-# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.
-loraplus_lr_ratio: float | None
-# loraplus learning rate for lora embedding layers. Default value is 1e-6.
-loraplus_lr_embedding: float | None = 1e-06
-
-merge_lora: bool | None
-
-# Number of steps per ReLoRA restart
-relora_steps: int | None
-# Number of per-restart warmup steps
-relora_warmup_steps: int | None
-# Number of anneal steps for each relora cycle
-relora_anneal_steps: int | None
-# Threshold for optimizer magnitude when pruning
-relora_prune_ratio: float | None
-# True to perform lora weight merges on cpu during restarts, for modest gpu memory
-# savings
-relora_cpu_offload: bool | None
-
-# If greater than 1, the optimizer step will be skipped and the gradients will be
-# accumulated for the given number of steps.
-gradient_accumulation_steps: int | None = 1
-# The number of samples to include in each batch. This is the number of samples sent to
-# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps
-micro_batch_size: int | None = 1
-# Total batch size, we do not recommend setting this manually
-batch_size: int | None
-# Per gpu micro batch size for evals, defaults to value of micro_batch_size
-eval_batch_size: int | None
-
-# Whether to find batch size that fits in memory. Passed to underlying transformers
-# Trainer
-auto_find_batch_size: bool | None
-
-# Whether to mask out or include the human's prompt from the training labels
-train_on_inputs: bool | None = False
-# Group similarly sized data to minimize padding. May be slower to start, as it must
-# download and sort the entire dataset. Note that training loss may have an oscillating
-# pattern with this enabled.
-group_by_length: bool | None
-
-learning_rate: str | float (required)
-embedding_lr: float | None
-embedding_lr_scale: float | None
-# Specify weight decay
-weight_decay: float | None = 0.0
-# Specify optimizer
-optimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED
-# Dictionary of arguments to pass to the optimizer
-optim_args: str | dict[str, Any] | None
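# Editor's note - hedged example of the batching and optimizer options above. The
# lowercase optimizer name and all numeric values are illustrative assumptions.
#
#   micro_batch_size: 2
#   gradient_accumulation_steps: 4
#   learning_rate: 2e-5
#   optimizer: adamw_torch_fused
#   weight_decay: 0.01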
-# The target modules to optimize, i.e. the module names that you would like to train,
-# right now this is used only for GaLore algorithm
-optim_target_modules: list[str] | Literal['all_linear'] | None
-# Path to torch distx for optim 'adamw_anyprecision'
-torchdistx_path: str | None
-lr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE
-# Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler_kwargs: dict[str, Any] | None
-lr_quadratic_warmup: bool | None
-# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of
-# peak lr
-cosine_min_lr_ratio: float | None
-# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means
-# start cosine_min_lr at 80% of training step
-cosine_constant_lr_ratio: float | None
-# Learning rate div factor
-lr_div_factor: float | None
-lr_groups: list[LrGroup] | None
-
-# adamw hyperparams
-adam_epsilon: float | None
-# only used for CAME Optimizer
-adam_epsilon2: float | None
-# adamw hyperparams
-adam_beta1: float | None
-# adamw hyperparams
-adam_beta2: float | None
-# only used for CAME Optimizer
-adam_beta3: float | None
-# Gradient clipping max norm
-max_grad_norm: float | None
-num_epochs: float = 1.0
-
-use_wandb: bool | None
-# Set the name of your wandb run
-wandb_name: str | None
-# Set the ID of your wandb run
-wandb_run_id: str | None
-# "offline" to save run metadata locally and not sync to the server, "disabled" to turn
-# off wandb
-wandb_mode: str | None
-# Your wandb project name
-wandb_project: str | None
-# A wandb Team name if using a Team
-wandb_entity: str | None
-wandb_watch: str | None
-# "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only
-# at the end of training
-wandb_log_model: str | None
-
-use_mlflow: bool | None
-# URI to mlflow
-mlflow_tracking_uri: str | None
-# Your experiment name
-mlflow_experiment_name: str | None
-# Your run name
-mlflow_run_name: str | None
-# set to true to copy each saved checkpoint on each save to mlflow artifact registry
-hf_mlflow_log_artifacts: bool | None
-
-# Enable or disable Comet integration.
-use_comet: bool | None
-# API key for Comet. Recommended to set via `comet login`.
-comet_api_key: str | None
-# Workspace name in Comet. Defaults to the user's default workspace.
-comet_workspace: str | None
-# Project name in Comet. Defaults to Uncategorized.
-comet_project_name: str | None
-# Identifier for the experiment. Used to append data to an existing experiment or
-# control the key of new experiments. Default to a random key.
-comet_experiment_key: str | None
-# Create a new experiment ("create") or log to an existing one ("get"). Default
-# ("get_or_create") auto-selects based on configuration.
-comet_mode: str | None
-# Set to True to log data to Comet server, or False for offline storage. Default is
-# True.
-comet_online: bool | None
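# Editor's note - hedged example of the W&B logging options above; the project and
# run names are placeholders.
#
#   use_wandb: true
#   wandb_project: my-axolotl-project
#   wandb_name: llama-sft-run-1
#   wandb_log_model: end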
-# Dictionary for additional configuration settings, see the doc for more details.
-comet_experiment_config: dict[str, Any] | None
-
-# The number of active layers in LISA
-lisa_n_layers: int | None
-# How often to switch layers in LISA
-lisa_step_interval: int | None
-# Path under the model to access the layers
-lisa_layers_attribute: str | None = model.layers
-
-gradio_title: str | None
-gradio_share: bool | None
-gradio_server_name: str | None
-gradio_server_port: int | None
-gradio_max_new_tokens: int | None
-gradio_temperature: float | None
-
-use_ray: bool = False
-ray_run_name: str | None
-ray_num_workers: int = 1
-resources_per_worker: dict
-
-# The size of the image to resize to. It can be an integer (resized into padded-square
-# image) or a tuple (width, height). If not provided, we will attempt to load from
-# preprocessor.size, otherwise, images won't be resized.
-image_size: int | tuple[int, int] | None
-# The resampling algorithm to use for image resizing. Default is bilinear. Please refer
-# to PIL.Image.Resampling for more details.
-image_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None
-
-# Optional overrides to the base model configuration
-overrides_of_model_config: dict[str, Any] | None
-# Optional overrides to the base model loading via from_pretrained
-overrides_of_model_kwargs: dict[str, Any] | None
-# If you want to specify the type of model to load, AutoModelForCausalLM is a good
-# choice too
-type_of_model: str | None
-# You can choose a specific model revision from huggingface hub
-revision_of_model: str | None
-
-max_packed_sequence_len: int | None
-rope_scaling: Any | None
-noisy_embedding_alpha: float | None
-dpo_beta: float | None
-evaluation_strategy: str | None
-```