update config doc

This commit is contained in:
Sunny Liu
2025-04-21 10:59:04 -04:00
parent ac24eba2ac
commit 320aff1867

View File

@@ -55,38 +55,48 @@ overrides_of_model_config:
overrides_of_model_kwargs: overrides_of_model_kwargs:
# use_cache: False # use_cache: False
# optional overrides to the bnb 4bit quantization configuration
# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
bnb_config_kwargs:
# These are default values
llm_int8_has_fp16_weight: false
bnb_4bit_quant_type: nf4
bnb_4bit_use_double_quant: true
# Overrides quantization method to use HQQ instead of default bnb.
# Quantization configuration.
# See: https://huggingface.co/docs/transformers/main/en/quantization/hqq # See: https://huggingface.co/docs/transformers/main/en/quantization/hqq
# https://github.com/mobiusml/hqq # https://github.com/mobiusml/hqq
use_hqq: true quantization:
hqq_config: backend: bnb | hqq | gptq
- nbits: 4 bits: 8
# optional overrides to the bnb 4bit quantization configuration
# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
bnb_config_kwargs:
# These are default values
llm_int8_has_fp16_weight: false
bnb_4bit_quant_type: nf4
bnb_4bit_use_double_quant: true
# If using hqq config, additional config parameters are needed. See: https://huggingface.co/docs/transformers/main/en/quantization/hqq
hqq_config:
# Pick one of the following, depending on whether you want to quantize the whole model uniformly or
# apply different quantization settings to specific layers in the model.
# To quantize the whole model uniformly:
group_size: 64 group_size: 64
target_modules: # To use dynamic_config to apply different quantization settings to specific layers:
- self_attn.k_proj - nbits: 4
- self_attn.v_proj group_size: 64
- self_attn.o_proj target_modules:
- nbits: 3 - self_attn.k_proj
group_size: 32 - self_attn.v_proj
target_modules: - self_attn.o_proj
- mlp.gate_proj - nbits: 3
- mlp.up_proj group_size: 32
- mlp.down_proj target_modules:
- mlp.gate_proj
- mlp.up_proj
- mlp.down_proj
# (Internal Use Only)
# Whether you are training a 4-bit GPTQ quantized model # Whether you are training a 4-bit GPTQ quantized model
gptq: true gptq:
# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
load_in_8bit: true load_in_8bit:
# Use bitsandbytes 4 bit # Use bitsandbytes 4 bit
load_in_4bit: load_in_4bit: