update config doc

This commit is contained in:
Sunny Liu
2025-04-21 10:59:04 -04:00
parent ac24eba2ac
commit 320aff1867

View File

@@ -55,19 +55,30 @@ overrides_of_model_config:
overrides_of_model_kwargs: overrides_of_model_kwargs:
# use_cache: False # use_cache: False
# optional overrides to the bnb 4bit quantization configuration
# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
bnb_config_kwargs: # Quantization configuration.
# See: https://huggingface.co/docs/transformers/main/en/quantization/hqq
# https://github.com/mobiusml/hqq
quantization:
backend: bnb | hqq | gptq
bits: 8
# optional overrides to the bnb 4bit quantization configuration
# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
bnb_config_kwargs:
# These are default values # These are default values
llm_int8_has_fp16_weight: false llm_int8_has_fp16_weight: false
bnb_4bit_quant_type: nf4 bnb_4bit_quant_type: nf4
bnb_4bit_use_double_quant: true bnb_4bit_use_double_quant: true
# Overrides quantization method to use HQQ instead of default bnb. # If using hqq config, additional config parameters are needed. See: https://huggingface.co/docs/transformers/main/en/quantization/hqq
# See: https://huggingface.co/docs/transformers/main/en/quantization/hqq hqq_config:
# https://github.com/mobiusml/hqq # pick one of the following, depending on if you want to uniformly quantize the whole model or
use_hqq: true # apply different quantization settings to specific layers in the model:
hqq_config:
# if uniformly quantizing the whole model:
group_size: 64
# if we want to invoke dynamic_config in order to apply different quantization settings to specific layers:
- nbits: 4 - nbits: 4
group_size: 64 group_size: 64
target_modules: target_modules:
@@ -81,12 +92,11 @@ hqq_config:
- mlp.up_proj - mlp.up_proj
- mlp.down_proj - mlp.down_proj
# (Internal Use Only)
# Whether you are training a 4-bit GPTQ quantized model # Whether you are training a 4-bit GPTQ quantized model
gptq: true gptq:
# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
load_in_8bit: true load_in_8bit:
# Use bitsandbytes 4 bit # Use bitsandbytes 4 bit
load_in_4bit: load_in_4bit: