Pydantic 2.x cfg (#1239)

* WIP conversion to use pydantic for config validation

* wip, more fields, add capabilities

* wip

* update pydantic validation to match existing tests

* tweak requirements

* set up deprecated params pydantic model (see the sketch after this log)

* more validations

* wrap up rest of the validations

* flesh out the rest of the options from the readme into pydantic

* fix model validators as class methods (see the sketch after this log)

remember to return in validator
missing return
add missing relora attributes
fix test for DictDefault change
fix sys template for mistral from fastchat change in PR 2872
fix test for batch size warning

* more missing attributes for cfg

* updates from PR feedback

* fix validation for datasets and pretrain datasets

* fix test for lora check
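
For the deprecated-params step, a minimal sketch of how retired options can be kept as accepted-but-ignored fields in Pydantic 2.x. The class name and the two example fields are assumptions for illustration, not copied from the PR.

```python
import logging
from typing import Optional

from pydantic import BaseModel, model_validator

LOG = logging.getLogger(__name__)


class DeprecatedParameters(BaseModel):
    """Hypothetical container for options that are still parsed but no longer used."""

    max_packed_sequence_len: Optional[int] = None  # assumed example field
    noisy_embedding_alpha: Optional[float] = None  # assumed example field

    @model_validator(mode="after")
    def warn_when_used(self):
        # "after" validators receive the constructed model and must return it
        for name in type(self).model_fields:
            if getattr(self, name) is not None:
                LOG.warning("`%s` is deprecated and will be ignored", name)
        return self
```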
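The "fix model validators as class methods" and "remember to return in validator" notes map onto the Pydantic 2.x "before"-mode validator pattern. A sketch with an invented cross-field rule, reusing option names from the README; it is not the actual schema added in this PR.

```python
from typing import Optional

from pydantic import BaseModel, model_validator


class QuantizationConfig(BaseModel):
    """Hypothetical slice of the config; the mutual-exclusion rule is made up."""

    load_in_8bit: Optional[bool] = None
    gptq: Optional[bool] = None

    @model_validator(mode="before")
    @classmethod
    def check_quantization(cls, data):
        # "before" validators run on the raw input and must hand it back;
        # dropping the return feeds None into field validation downstream
        if isinstance(data, dict) and data.get("load_in_8bit") and data.get("gptq"):
            raise ValueError("load_in_8bit and gptq cannot both be enabled")
        return data
```

If the final `return data` is missing, Pydantic sees `None` as the input and validation fails, which is presumably what the "missing return" note above refers to.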
Author: Wing Lian
Date: 2024-02-26 12:24:14 -05:00
Committed by: GitHub
Parent: 5894f0e57e
Commit: cc3cebfa70
16 changed files with 1710 additions and 410 deletions

@@ -543,7 +543,7 @@ is_mistral_derived_model:
is_qwen_derived_model:
# optional overrides to the base model configuration
-model_config:
+model_config_overrides:
# RoPE Scaling https://github.com/huggingface/transformers/pull/24653
rope_scaling:
  type: # linear | dynamic
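
The rename in this hunk lines up with Pydantic 2.x reserving `model_config` on `BaseModel` for the model's own settings, so the option cannot keep its old name as a field. A sketch of one possible migration that still accepts the old YAML key via an alias; the class name and alias handling are assumptions, not taken from the PR.

```python
from typing import Any, Dict, Optional

from pydantic import BaseModel, ConfigDict, Field


class ModelInputConfig(BaseModel):
    # `model_config` is reserved by Pydantic 2.x, and field names starting with
    # `model_` trip the protected-namespace warning, hence the two settings below
    model_config = ConfigDict(populate_by_name=True, protected_namespaces=())

    model_config_overrides: Optional[Dict[str, Any]] = Field(
        default=None, alias="model_config"
    )
```

With this sketch, `ModelInputConfig.model_validate({"model_config": {"rope_scaling": {"type": "linear"}}})` still accepts configs written against the old key.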
@@ -560,8 +560,6 @@ bnb_config_kwargs:
# Whether you are training a 4-bit GPTQ quantized model
gptq: true
gptq_groupsize: 128 # group size
gptq_model_v1: false # v1 or v2
# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
load_in_8bit: true
@@ -819,10 +817,6 @@ cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosin
# For one_cycle optim
lr_div_factor: # Learning rate div factor
# For log_sweep optim
log_sweep_min_lr:
log_sweep_max_lr:
# Specify optimizer
# Valid values are driven by the Transformers OptimizerNames class, see:
# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
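
Since valid optimizer values are driven by the transformers `OptimizerNames` enum, a rough sketch of how such a field could be checked against it in Pydantic 2.x; the `OptimizerConfig` model and its single field are illustrative, not the PR's actual schema.

```python
from typing import Optional

from pydantic import BaseModel, field_validator
from transformers.training_args import OptimizerNames


class OptimizerConfig(BaseModel):
    """Hypothetical wrapper around the `optimizer` option."""

    optimizer: Optional[str] = None

    @field_validator("optimizer")
    @classmethod
    def check_optimizer(cls, value):
        # OptimizerNames is a string enum, so its .value entries are the
        # CLI-style names such as "adamw_torch" or "adafactor"
        if value is not None and value not in {opt.value for opt in OptimizerNames}:
            raise ValueError(f"unknown optimizer: {value!r}")
        return value
```

`OptimizerConfig(optimizer="adamw_torch")` passes, while a typo such as `"adamw_trch"` raises a `ValidationError`.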