Add support for GPTQ using native transformers/peft (#468)

* auto gptq support * more tweaks and add yml * remove old gptq docker * don't need explicit peft install for tests * fix setup.py to use extra index url install torch for tests fix cuda version for autogptq index set torch in requirements so that it installs properly move gptq install around to work with github cicd * gptq doesn't play well with sample packing * address pr feedback * remove torch install for now * set quantization_config from model config * Fix the implementation for getting quant config from model config
2023-09-05 12:43:22 -04:00
parent daa4faca12
commit 3355706e22
11 changed files with 142 additions and 210 deletions
--- a/examples/gptq-lora-7b/README.md
+++ b/examples/gptq-lora-7b/README.md
@@ -1,8 +0,0 @@
-# LLaMa 7B using LoRA
-
-This is a good place to start for beginners. This will run on an NVIDIA RTX4090 with no other changes needed.
-
-```shell
-accelerate launch scripts/finetune.py examples/gptq-lora-7b/config.yml
-
-```
--- a/examples/gptq-lora-7b/config.yml
+++ b/examples/gptq-lora-7b/config.yml
@@ -1,63 +0,0 @@
-base_model: Neko-Institute-of-Science/LLaMA-7B-4bit-128g
-base_model_config: Neko-Institute-of-Science/LLaMA-7B-4bit-128g
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-trust_remote_code:
-load_in_8bit: true
-gptq: true
-datasets:
-  - path: vicgalle/alpaca-gpt4
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.02
-adapter:
-lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len:
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project: llama-7b-lora-int4
-wandb_entity:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./llama-7b-lora-int4
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 3
-optimizer: adamw_bnb_8bit
-torchdistx_path:
-lr_scheduler: cosine
-learning_rate: 0.0000002
-train_on_inputs: false
-group_by_length: false
-fp16: true
-bf16: false
-tf32: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 5
-xformers_attention:
-flash_attention:
-gradient_checkpointing: true
-gptq_groupsize: 128
-gptq_model_v1: false
-warmup_steps: 20
-eval_steps: 110
-save_steps: 660
-debug:
-deepspeed:
-weight_decay: 0.0001
-fsdp:
-fsdp_config:
-tokens:
-  pad_token: "<pad>"
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -0,0 +1,76 @@
+base_model: TheBloke/Llama-2-7B-GPTQ
+base_model_config: TheBloke/Llama-2-7B-GPTQ
+is_llama_derived_model: false
+gptq: true
+gptq_bits: 4
+model_type: AutoModelForCausalLM
+tokenizer_type: LlamaTokenizer
+tokenizer_use_fast: true
+tokenizer_legacy: true
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+push_dataset_to_hub:
+hf_use_auth_token: true
+datasets:
+  - path: mhenrichsen/alpaca_2k_test
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+adapter: lora
+lora_model_dir:
+sequence_len: 4096
+sample_packing:
+lora_r: 8
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_modules:
+  - k_proj
+  - o_proj
+  - q_proj
+  - v_proj
+lora_target_linear:
+lora_fan_in_fan_out:
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./model-out
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 3
+optimizer: adamw_torch
+adam_beta2: 0.95
+adam_eps: 0.00001
+max_grad_norm: 1.0
+torchdistx_path:
+lr_scheduler: cosine
+lr_quadratic_warmup: true
+learning_rate: 0.000017
+train_on_inputs: false
+group_by_length: false
+bf16: false
+fp16: false
+float16: true
+tf32: true
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention:
+sdp_attention:
+flash_optimum:
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 100
+eval_steps:
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.1
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"