auto gptq support

2023-08-23 21:16:48 -04:00
parent 548787daae
commit 1deb767fe8
4 changed files with 44 additions and 107 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/
+auto-gptq
 packaging
 peft @ git+https://github.com/huggingface/peft.git
 transformers @ git+https://github.com/huggingface/transformers.git
--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -97,9 +97,7 @@ def validate_config(cfg):
            "To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
        )
    if cfg.load_4bit:
-        raise ValueError(
-            "cfg.load_4bit parameter has been deprecated and replaced by cfg.gptq"
-        )
+        raise ValueError("cfg.load_4bit parameter has been deprecated")

    if cfg.adapter == "qlora":
        if cfg.merge_lora:
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -4,19 +4,29 @@
 import logging
 import math
 import os
+<<<<<<< HEAD
 from pathlib import Path
 from typing import Optional, Tuple  # noqa: F401
+=======
+from typing import TYPE_CHECKING, Optional, Tuple  # noqa: F401
+>>>>>>> 10d25df (auto gptq support)

 import bitsandbytes as bnb
 import torch
 import transformers
 from optimum.bettertransformer import BetterTransformer
+<<<<<<< HEAD
 from peft import PeftConfig
+=======
+from peft import prepare_model_for_kbit_training
+from peft.tuners.lora import LoraLayer
+>>>>>>> 10d25df (auto gptq support)
 from transformers import (  # noqa: F401
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
+    GPTQConfig,
    LlamaConfig,
    PreTrainedModel,
    PreTrainedTokenizerBase,
@@ -155,32 +165,15 @@ def load_model(
        LOG.info("patching _expand_mask")
        hijack_expand_mask()

-    try:
-        if cfg.gptq:
-            from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
-                replace_peft_model_with_int4_lora_model,
-            )
-
-            replace_peft_model_with_int4_lora_model()
-    except Exception as err:
-        LOG.exception(err)
-        raise err
-
-    if not cfg.gptq and (
-        (cfg.adapter == "lora" and load_in_8bit)
-        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
-    ):
-        try:
-            from peft import prepare_model_for_kbit_training
-        except ImportError:
-            # For backward compatibility
-            from peft import (
-                prepare_model_for_int8_training as prepare_model_for_kbit_training,
-            )
-
    model_kwargs = {}
    if cfg.model_revision:
        model_kwargs["revision"] = cfg.model_revision
+    if cfg.gptq:
+        # TODO: we should figure out how to read the model's config.json first
+        model_kwargs["quantization_config"] = GPTQConfig(
+            bits=cfg.gptq_bits,
+            disable_exllama=True,
+        )
    if cfg.adapter == "qlora" and cfg.load_in_4bit:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
@@ -191,45 +184,7 @@ def load_model(
            bnb_4bit_quant_type="nf4",
        )
    try:
-        if cfg.gptq and cfg.is_llama_derived_model:
-            from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram
-            from huggingface_hub import snapshot_download
-
-            try:
-                snapshot_download_kwargs = {}
-                if cfg.base_model_ignore_patterns:
-                    snapshot_download_kwargs[
-                        "ignore_patterns"
-                    ] = cfg.base_model_ignore_patterns
-                cache_model_path = Path(
-                    snapshot_download(base_model, **snapshot_download_kwargs)
-                )
-                files = (
-                    list(cache_model_path.glob("*.pt"))
-                    + list(cache_model_path.glob("*.safetensors"))
-                    + list(cache_model_path.glob("*.bin"))
-                )
-                if len(files) > 0:
-                    model_path = str(files[0])
-                else:
-                    LOG.warning(
-                        "unable to find a cached model file, this will likely fail..."
-                    )
-                    model_path = str(cache_model_path)
-            except Exception:  # pylint: disable=broad-exception-caught
-                model_path = cfg.base_model
-            model, _ = load_llama_model_4bit_low_ram(
-                base_model_config if base_model_config else base_model,
-                model_path,
-                device_map=cfg.device_map,
-                half=cfg.fp16,
-                groupsize=cfg.gptq_groupsize if cfg.gptq_groupsize else -1,
-                is_v1_model=cfg.gptq_model_v1
-                if cfg.gptq_model_v1 is not None
-                else True,
-            )
-            load_in_8bit = False
-        elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
+        if cfg.is_llama_derived_model and not cfg.trust_remote_code and not cfg.gptq:
            from transformers import LlamaForCausalLM

            config_kwargs = {}
@@ -275,15 +230,24 @@ def load_model(
        #     )
        #     model.train() # sets to train instead of eval mode
        elif model_type and not cfg.trust_remote_code:
-            model = getattr(transformers, model_type).from_pretrained(
-                base_model,
-                device_map=cfg.device_map,
-                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
-                load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
-                torch_dtype=cfg.torch_dtype,
-                trust_remote_code=cfg.trust_remote_code or False,
-                **model_kwargs,
-            )
+            if cfg.gptq:
+                model = AutoModelForCausalLM.from_pretrained(
+                    base_model,
+                    device_map=cfg.device_map,
+                    torch_dtype=cfg.torch_dtype,
+                    trust_remote_code=cfg.trust_remote_code or False,
+                    **model_kwargs,
+                )
+            else:
+                model = getattr(transformers, model_type).from_pretrained(
+                    base_model,
+                    device_map=cfg.device_map,
+                    load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
+                    load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
+                    torch_dtype=cfg.torch_dtype,
+                    trust_remote_code=cfg.trust_remote_code or False,
+                    **model_kwargs,
+                )
        else:
            config = AutoConfig.from_pretrained(
                base_model,
@@ -359,11 +323,12 @@ def load_model(
                module.to(torch.float32)

    needs_fa2_dtype = cfg.adapter or cfg.fsdp
-    if not cfg.gptq and (
-        (cfg.adapter == "lora" and load_in_8bit)
-        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
+    if (cfg.adapter == "lora" and load_in_8bit) or (
+        cfg.adapter == "qlora" and cfg.load_in_4bit
    ):
        LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
+        if cfg.gradient_checkpointing:
+            model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(
            model, use_gradient_checkpointing=cfg.gradient_checkpointing
        )
@@ -385,22 +350,10 @@ def load_model(
    if cfg.ddp and not load_in_8bit:
        model.to(f"cuda:{cfg.local_rank}")

-    if cfg.gptq:
-        # Scales to half
-        LOG.info("Fitting 4bit scales and zeros to half")
-        for _, module in model.named_modules():
-            if "Autograd4bitQuantLinear" in str(type(module)) or "Linear4bitLt" in str(
-                type(module)
-            ):
-                if hasattr(module, "is_v1_model") and module.is_v1_model:
-                    module.zeros = module.zeros.half()
-                module.scales = module.scales.half()
-                module.bias = module.bias.half()
-
    if (
        torch.cuda.device_count() > 1
        and int(os.getenv("WORLD_SIZE", "1")) > 1
-        and (cfg.gptq or cfg.load_in_4bit)
+        and (cfg.load_in_4bit)
    ):
        # llama is PROBABLY model parallelizable, but the default isn't that it is
        # so let's only set it for the 4bit, see
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -447,23 +447,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
        training_arguments_kwargs["seed"] = cfg.seed

    if cfg.gradient_checkpointing:
-        if cfg.gptq:
-            from alpaca_lora_4bit.gradient_checkpointing import (
-                apply_gradient_checkpointing,
-            )
-
-            gradient_checkpointing_ratio = (
-                cfg.gradient_checkpointing_ratio
-                if cfg.gradient_checkpointing_ratio
-                else 1.0
-            )
-            apply_gradient_checkpointing(
-                model, checkpoint_ratio=gradient_checkpointing_ratio
-            )
-        else:
-            training_arguments_kwargs[
-                "gradient_checkpointing"
-            ] = cfg.gradient_checkpointing
+        training_arguments_kwargs["gradient_checkpointing"] = cfg.gradient_checkpointing
    if cfg.fsdp:
        training_arguments_kwargs["fsdp"] = cfg.fsdp
        if cfg.fsdp_config: