From 3355706e22d95ddca7dc242fe429f06d4ce89526 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 5 Sep 2023 12:43:22 -0400 Subject: [PATCH] Add support for GPTQ using native transformers/peft (#468) * auto gptq support * more tweaks and add yml * remove old gptq docker * don't need explicit peft install for tests * fix setup.py to use extra index url install torch for tests fix cuda version for autogptq index set torch in requirements so that it installs properly move gptq install around to work with github cicd * gptq doesn't play well with sample packing * address pr feedback * remove torch install for now * set quantization_config from model config * Fix the implementation for getting quant config from model config --- .github/workflows/main.yml | 10 --- .github/workflows/tests.yml | 2 +- docker/Dockerfile | 5 +- examples/gptq-lora-7b/README.md | 8 -- examples/gptq-lora-7b/config.yml | 63 ---------------- examples/llama-2/gptq-lora.yml | 76 +++++++++++++++++++ requirements.txt | 4 + setup.py | 39 ++++++---- src/axolotl/utils/config.py | 4 +- src/axolotl/utils/models.py | 123 +++++++++---------------------- src/axolotl/utils/trainer.py | 18 +---- 11 files changed, 142 insertions(+), 210 deletions(-) delete mode 100644 examples/gptq-lora-7b/README.md delete mode 100644 examples/gptq-lora-7b/config.yml create mode 100644 examples/llama-2/gptq-lora.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d20db7065..30d4774db 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -23,11 +23,6 @@ jobs: python_version: "3.10" pytorch: 2.0.1 axolotl_extras: - - cuda: 118 - cuda_version: 11.8.0 - python_version: "3.9" - pytorch: 2.0.1 - axolotl_extras: gptq runs-on: self-hosted steps: - name: Checkout @@ -73,11 +68,6 @@ jobs: pytorch: 2.0.1 axolotl_extras: is_latest: true - - cuda: 118 - cuda_version: 11.8.0 - python_version: "3.9" - pytorch: 2.0.1 - axolotl_extras: gptq runs-on: self-hosted steps: - name: Checkout diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 01703cd51..d5184def6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -24,7 +24,7 @@ jobs: - name: Install dependencies run: | - pip install -e .[peft] + pip install -e . pip install -r requirements-tests.txt - name: Run tests diff --git a/docker/Dockerfile b/docker/Dockerfile index b429d50f2..683ca75ff 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -11,14 +11,13 @@ RUN apt-get update && \ WORKDIR /workspace -RUN pip3 install "peft @ git+https://github.com/huggingface/peft.git@main" RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git # If AXOLOTL_EXTRAS is set, append it in brackets RUN cd axolotl && \ if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ - pip install -e .[flash-attn,$AXOLOTL_EXTRAS]; \ + pip install -e .[flash-attn,gptq,$AXOLOTL_EXTRAS]; \ else \ - pip install -e .[flash-attn]; \ + pip install -e .[flash-attn,gptq]; \ fi # fix so that git fetch/pull from remote works diff --git a/examples/gptq-lora-7b/README.md b/examples/gptq-lora-7b/README.md deleted file mode 100644 index 0bde51b06..000000000 --- a/examples/gptq-lora-7b/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# LLaMa 7B using LoRA - -This is a good place to start for beginners. This will run on an NVIDIA RTX4090 with no other changes needed. - -```shell -accelerate launch scripts/finetune.py examples/gptq-lora-7b/config.yml - -``` diff --git a/examples/gptq-lora-7b/config.yml b/examples/gptq-lora-7b/config.yml deleted file mode 100644 index d909f7d07..000000000 --- a/examples/gptq-lora-7b/config.yml +++ /dev/null @@ -1,63 +0,0 @@ -base_model: Neko-Institute-of-Science/LLaMA-7B-4bit-128g -base_model_config: Neko-Institute-of-Science/LLaMA-7B-4bit-128g -model_type: LlamaForCausalLM -tokenizer_type: LlamaTokenizer -trust_remote_code: -load_in_8bit: true -gptq: true -datasets: - - path: vicgalle/alpaca-gpt4 - type: alpaca -dataset_prepared_path: last_run_prepared -val_set_size: 0.02 -adapter: -lora_model_dir: -sequence_len: 2048 -max_packed_sequence_len: -lora_r: 8 -lora_alpha: 16 -lora_dropout: 0.05 -lora_target_modules: - - q_proj - - v_proj -lora_fan_in_fan_out: false -wandb_project: llama-7b-lora-int4 -wandb_entity: -wandb_watch: -wandb_run_id: -wandb_log_model: -output_dir: ./llama-7b-lora-int4 -gradient_accumulation_steps: 1 -micro_batch_size: 1 -num_epochs: 3 -optimizer: adamw_bnb_8bit -torchdistx_path: -lr_scheduler: cosine -learning_rate: 0.0000002 -train_on_inputs: false -group_by_length: false -fp16: true -bf16: false -tf32: true -early_stopping_patience: -resume_from_checkpoint: -local_rank: -logging_steps: 5 -xformers_attention: -flash_attention: -gradient_checkpointing: true -gptq_groupsize: 128 -gptq_model_v1: false -warmup_steps: 20 -eval_steps: 110 -save_steps: 660 -debug: -deepspeed: -weight_decay: 0.0001 -fsdp: -fsdp_config: -tokens: - pad_token: "" - bos_token: "" - eos_token: "" - unk_token: "" diff --git a/examples/llama-2/gptq-lora.yml b/examples/llama-2/gptq-lora.yml new file mode 100644 index 000000000..dbce2a6b3 --- /dev/null +++ b/examples/llama-2/gptq-lora.yml @@ -0,0 +1,76 @@ +base_model: TheBloke/Llama-2-7B-GPTQ +base_model_config: TheBloke/Llama-2-7B-GPTQ +is_llama_derived_model: false +gptq: true +gptq_bits: 4 +model_type: AutoModelForCausalLM +tokenizer_type: LlamaTokenizer +tokenizer_use_fast: true +tokenizer_legacy: true +load_in_8bit: false +load_in_4bit: false +strict: false +push_dataset_to_hub: +hf_use_auth_token: true +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +adapter: lora +lora_model_dir: +sequence_len: 4096 +sample_packing: +lora_r: 8 +lora_alpha: 32 +lora_dropout: 0.05 +lora_target_modules: + - k_proj + - o_proj + - q_proj + - v_proj +lora_target_linear: +lora_fan_in_fan_out: +wandb_project: +wandb_watch: +wandb_run_id: +wandb_log_model: +output_dir: ./model-out +gradient_accumulation_steps: 1 +micro_batch_size: 1 +num_epochs: 3 +optimizer: adamw_torch +adam_beta2: 0.95 +adam_eps: 0.00001 +max_grad_norm: 1.0 +torchdistx_path: +lr_scheduler: cosine +lr_quadratic_warmup: true +learning_rate: 0.000017 +train_on_inputs: false +group_by_length: false +bf16: false +fp16: false +float16: true +tf32: true +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: +sdp_attention: +flash_optimum: +gptq_groupsize: +gptq_model_v1: +warmup_steps: 100 +eval_steps: +save_steps: +debug: +deepspeed: +weight_decay: 0.1 +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/requirements.txt b/requirements.txt index fcd7f9292..1c8e97dff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,7 @@ +--extra-index-url https://download.pytorch.org/whl/cu118 +--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ +torch==2.0.1 +auto-gptq packaging peft @ git+https://github.com/huggingface/peft.git transformers @ git+https://github.com/huggingface/transformers.git diff --git a/setup.py b/setup.py index 7b99794de..973d656cd 100644 --- a/setup.py +++ b/setup.py @@ -2,15 +2,27 @@ from setuptools import find_packages, setup -install_requires = [] -with open("./requirements.txt", encoding="utf-8") as requirements_file: - # don't include peft yet until we check the int4 - # need to manually install peft for now... - reqs = [r.strip() for r in requirements_file.readlines() if "peft" not in r] - reqs = [r for r in reqs if "flash-attn" not in r] - reqs = [r for r in reqs if r and r[0] != "#"] - for r in reqs: - install_requires.append(r) + +def parse_requirements(): + _install_requires = [] + _dependency_links = [] + with open("./requirements.txt", encoding="utf-8") as requirements_file: + lines = [ + r.strip() for r in requirements_file.readlines() if "auto-gptq" not in r + ] + for line in lines: + if line.startswith("--extra-index-url"): + # Handle custom index URLs + _, url = line.split() + _dependency_links.append(url) + elif "flash-attn" not in line and line and line[0] != "#": + # Handle standard packages + _install_requires.append(line) + return _install_requires, _dependency_links + + +install_requires, dependency_links = parse_requirements() + setup( name="axolotl", @@ -19,12 +31,10 @@ setup( package_dir={"": "src"}, packages=find_packages(), install_requires=install_requires, + dependency_links=dependency_links, extras_require={ "gptq": [ - "alpaca_lora_4bit @ git+https://github.com/winglian/alpaca_lora_4bit.git@setup_pip", - ], - "gptq_triton": [ - "alpaca_lora_4bit[triton] @ git+https://github.com/winglian/alpaca_lora_4bit.git@setup_pip", + "auto-gptq", ], "flash-attn": [ "flash-attn==2.0.8", @@ -32,8 +42,5 @@ setup( "extras": [ "deepspeed", ], - "peft": [ - "peft @ git+https://github.com/huggingface/peft.git", - ], }, ) diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py index 93a23f773..0fbccd205 100644 --- a/src/axolotl/utils/config.py +++ b/src/axolotl/utils/config.py @@ -108,9 +108,7 @@ def validate_config(cfg): "To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.", ) if cfg.load_4bit: - raise ValueError( - "cfg.load_4bit parameter has been deprecated and replaced by cfg.gptq" - ) + raise ValueError("cfg.load_4bit parameter has been deprecated") if cfg.adapter == "qlora": if cfg.merge_lora: diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 9f0795af7..9ec51f4f7 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -4,19 +4,19 @@ import logging import math import os -from pathlib import Path from typing import Optional, Tuple # noqa: F401 import bitsandbytes as bnb import torch import transformers from optimum.bettertransformer import BetterTransformer -from peft import PeftConfig +from peft import PeftConfig, prepare_model_for_kbit_training from transformers import ( # noqa: F401 AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, + GPTQConfig, LlamaConfig, PreTrainedModel, PreTrainedTokenizerBase, @@ -155,32 +155,17 @@ def load_model( LOG.info("patching _expand_mask") hijack_expand_mask() - try: - if cfg.gptq: - from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import ( - replace_peft_model_with_int4_lora_model, - ) - - replace_peft_model_with_int4_lora_model() - except Exception as err: - LOG.exception(err) - raise err - - if not cfg.gptq and ( - (cfg.adapter == "lora" and load_in_8bit) - or (cfg.adapter == "qlora" and cfg.load_in_4bit) - ): - try: - from peft import prepare_model_for_kbit_training - except ImportError: - # For backward compatibility - from peft import ( - prepare_model_for_int8_training as prepare_model_for_kbit_training, - ) - model_kwargs = {} if cfg.model_revision: model_kwargs["revision"] = cfg.model_revision + if cfg.gptq: + model_config = load_model_config(cfg) + if hasattr(model_config, "quantization_config"): + LOG.warning("model config does not contain quantization_config information") + else: + model_kwargs["quantization_config"] = GPTQConfig( + **model_config.quantization_config + ) if cfg.adapter == "qlora" and cfg.load_in_4bit: model_kwargs["quantization_config"] = BitsAndBytesConfig( load_in_4bit=True, @@ -191,45 +176,7 @@ def load_model( bnb_4bit_quant_type="nf4", ) try: - if cfg.gptq and cfg.is_llama_derived_model: - from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram - from huggingface_hub import snapshot_download - - try: - snapshot_download_kwargs = {} - if cfg.base_model_ignore_patterns: - snapshot_download_kwargs[ - "ignore_patterns" - ] = cfg.base_model_ignore_patterns - cache_model_path = Path( - snapshot_download(base_model, **snapshot_download_kwargs) - ) - files = ( - list(cache_model_path.glob("*.pt")) - + list(cache_model_path.glob("*.safetensors")) - + list(cache_model_path.glob("*.bin")) - ) - if len(files) > 0: - model_path = str(files[0]) - else: - LOG.warning( - "unable to find a cached model file, this will likely fail..." - ) - model_path = str(cache_model_path) - except Exception: # pylint: disable=broad-exception-caught - model_path = cfg.base_model - model, _ = load_llama_model_4bit_low_ram( - base_model_config if base_model_config else base_model, - model_path, - device_map=cfg.device_map, - half=cfg.fp16, - groupsize=cfg.gptq_groupsize if cfg.gptq_groupsize else -1, - is_v1_model=cfg.gptq_model_v1 - if cfg.gptq_model_v1 is not None - else True, - ) - load_in_8bit = False - elif cfg.is_llama_derived_model and not cfg.trust_remote_code: + if cfg.is_llama_derived_model and not cfg.trust_remote_code and not cfg.gptq: from transformers import LlamaForCausalLM config_kwargs = {} @@ -275,15 +222,24 @@ def load_model( # ) # model.train() # sets to train instead of eval mode elif model_type and not cfg.trust_remote_code: - model = getattr(transformers, model_type).from_pretrained( - base_model, - device_map=cfg.device_map, - load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, - load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None, - torch_dtype=cfg.torch_dtype, - trust_remote_code=cfg.trust_remote_code or False, - **model_kwargs, - ) + if cfg.gptq: + model = AutoModelForCausalLM.from_pretrained( + base_model, + device_map=cfg.device_map, + torch_dtype=cfg.torch_dtype, + trust_remote_code=cfg.trust_remote_code or False, + **model_kwargs, + ) + else: + model = getattr(transformers, model_type).from_pretrained( + base_model, + device_map=cfg.device_map, + load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, + load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None, + torch_dtype=cfg.torch_dtype, + trust_remote_code=cfg.trust_remote_code or False, + **model_kwargs, + ) else: config = AutoConfig.from_pretrained( base_model, @@ -359,11 +315,12 @@ def load_model( module.to(torch.float32) needs_fa2_dtype = cfg.adapter or cfg.fsdp - if not cfg.gptq and ( - (cfg.adapter == "lora" and load_in_8bit) - or (cfg.adapter == "qlora" and cfg.load_in_4bit) + if (cfg.adapter == "lora" and load_in_8bit) or ( + cfg.adapter == "qlora" and cfg.load_in_4bit ): LOG.info("converting PEFT model w/ prepare_model_for_kbit_training") + if cfg.gradient_checkpointing: + model.gradient_checkpointing_enable() model = prepare_model_for_kbit_training( model, use_gradient_checkpointing=cfg.gradient_checkpointing ) @@ -385,22 +342,10 @@ def load_model( if cfg.ddp and not load_in_8bit: model.to(f"cuda:{cfg.local_rank}") - if cfg.gptq: - # Scales to half - LOG.info("Fitting 4bit scales and zeros to half") - for _, module in model.named_modules(): - if "Autograd4bitQuantLinear" in str(type(module)) or "Linear4bitLt" in str( - type(module) - ): - if hasattr(module, "is_v1_model") and module.is_v1_model: - module.zeros = module.zeros.half() - module.scales = module.scales.half() - module.bias = module.bias.half() - if ( torch.cuda.device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) > 1 - and (cfg.gptq or cfg.load_in_4bit) + and (cfg.load_in_4bit) ): # llama is PROBABLY model parallelizable, but the default isn't that it is # so let's only set it for the 4bit, see diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index f91f4e318..c3d6b85cb 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -514,23 +514,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ training_arguments_kwargs["seed"] = cfg.seed if cfg.gradient_checkpointing: - if cfg.gptq: - from alpaca_lora_4bit.gradient_checkpointing import ( - apply_gradient_checkpointing, - ) - - gradient_checkpointing_ratio = ( - cfg.gradient_checkpointing_ratio - if cfg.gradient_checkpointing_ratio - else 1.0 - ) - apply_gradient_checkpointing( - model, checkpoint_ratio=gradient_checkpointing_ratio - ) - else: - training_arguments_kwargs[ - "gradient_checkpointing" - ] = cfg.gradient_checkpointing + training_arguments_kwargs["gradient_checkpointing"] = cfg.gradient_checkpointing if cfg.fsdp: training_arguments_kwargs["fsdp"] = cfg.fsdp if cfg.fsdp_config: