diff --git a/docs/qat.qmd b/docs/qat.qmd index e0d000a79..ad9779066 100644 --- a/docs/qat.qmd +++ b/docs/qat.qmd @@ -23,10 +23,17 @@ To enable QAT in axolotl, add the following to your configuration file: ```yaml qat: - activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8" - weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8" + activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4", "int8", and "float8" + weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4", "fp8", and "nvfp4". group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after ``` +We support the following quantization schemes: +- `Int4WeightOnly` (requires the `fbgemm-gpu` extra when installing Axolotl) +- `Int8DynamicActivationInt4Weight` +- `Float8DynamicActivationFloat8Weight` +- `Float8DynamicActivationInt4Weight` +- `NVFP4` + Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize`](./quantize.qmd) command to do this. diff --git a/docs/quantize.qmd b/docs/quantize.qmd index 43c817a5b..9c3de1ef1 100644 --- a/docs/quantize.qmd +++ b/docs/quantize.qmd @@ -22,8 +22,8 @@ Quantization is configured using the `quantization` key in your configuration fi ```yaml base_model: # The path to the model to quantize. quantization: - weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8 - activation_dtype: # Optional[str] = "int8". 
Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8" + activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4", "int8", and "float8" + weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4", "fp8", and "nvfp4". group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer. @@ -39,9 +39,8 @@ you used to train the model: # qat.yml qat: activation_dtype: int8 - weight_dtype: int4 group_size: 256 - quantize_embedding: true output_dir: # The path to the output directory used during training where the final checkpoint has been saved. ``` diff --git a/examples/llama-3/3b-qat-fsdp2-nvfp4.yaml b/examples/llama-3/3b-qat-nvfp4.yaml similarity index 100% rename from examples/llama-3/3b-qat-fsdp2-nvfp4.yaml rename to examples/llama-3/3b-qat-nvfp4.yaml