feature: better device mapping for large models (#918)

* fix: improved memory handling when model is bigger than existing VRAM
* feature: add lora_on_cpu flag to do LoRA loading on CPU (RAM)

  For big models where the models are taking up the entire GPU VRAM, the LoRA part will fail unless it is loaded on CPU only.
* doc: add README
* fix: enable progress bars in do_merge_lora()
* doc: mention gpu_memory_limit and lora_on_cpu in merge part of README
* Update src/axolotl/utils/models.py

  Co-authored-by: Wing Lian <wing.lian@gmail.com>
* fix: remove deletion of removed model_kwargs key
* fix: validate that gpu_memory_limit and max_memory are not both set

---------

Co-authored-by: Karl-Johan Alm <kalle@gmail.com>
Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-05 22:22:21 +09:00
parent 63fb3eb426
commit bdfefaf054
4 changed files with 52 additions and 6 deletions
--- a/src/axolotl/cli/__init__.py
+++ b/src/axolotl/cli/__init__.py
@@ -73,7 +73,7 @@ def do_merge_lora(
    safe_serialization = cfg.save_safetensors is True

    LOG.info("running merge of LoRA with base model")
-    model = model.merge_and_unload()
+    model = model.merge_and_unload(progressbar=True)
    model.to(dtype=cfg.torch_dtype)

    if cfg.local_rank == 0:
@@ -81,6 +81,7 @@ def do_merge_lora(
        model.save_pretrained(
            str(Path(cfg.output_dir) / "merged"),
            safe_serialization=safe_serialization,
+            progressbar=True,
        )
        tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))

--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -457,6 +457,11 @@ def validate_config(cfg):
            "lora_modules_to_save not properly set yet adding new tokens. Please add `embed_tokens` and `lm_head` to `lora_modules_to_save`."
        )

+    if cfg.max_memory is not None and cfg.gpu_memory_limit is not None:
+        raise ValueError(
+            "max_memory and gpu_memory_limit are mutually exclusive and cannot be used together."
+        )
+
    # TODO
    # MPT 7b
    # https://github.com/facebookresearch/bitsandbytes/issues/25
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -2,7 +2,7 @@
 import logging
 import math
 import os
-from typing import Optional, Tuple  # noqa: F401
+from typing import Any, Optional, Tuple  # noqa: F401

 import addict
 import bitsandbytes as bnb
@@ -288,8 +288,37 @@ def load_model(

    model_kwargs = {}

-    model_kwargs["device_map"] = cfg.device_map
-    model_kwargs["max_memory"] = cfg.max_memory
+    max_memory = cfg.max_memory
+    device_map = cfg.device_map
+
+    if cfg.gpu_memory_limit:
+        gpu_memory_limit = (
+            str(cfg.gpu_memory_limit) + "GiB"
+            if isinstance(cfg.gpu_memory_limit, int)
+            else cfg.gpu_memory_limit
+        )
+
+        max_memory = {}
+        for i in range(torch.cuda.device_count()):
+            max_memory[i] = gpu_memory_limit
+        max_memory["cpu"] = "256GiB"  # something sufficiently large to fit anything
+
+    if max_memory is not None:
+        # Based on https://github.com/togethercomputer/OpenChatKit/blob/main/inference/bot.py
+        from accelerate import infer_auto_device_map, init_empty_weights
+
+        with init_empty_weights():
+            model_canvas = AutoModelForCausalLM.from_config(model_config)
+        model_canvas.tie_weights()
+        device_map = infer_auto_device_map(
+            model_canvas,
+            max_memory=max_memory,
+            dtype=cfg.torch_dtype,
+        )
+        # We can discard max_memory now as we have a device map set up for us
+        max_memory = None
+
+    model_kwargs["device_map"] = device_map
    model_kwargs["torch_dtype"] = cfg.torch_dtype
    # TODO can we put the reference model on it's own gpu? I think we have to move logits around to calculate loss
    # if cfg.rl:
@@ -426,7 +455,6 @@ def load_model(
            model_kwargs["device"] = torch.cuda.current_device()
            del model_kwargs["torch_dtype"]
            del model_kwargs["device_map"]
-            del model_kwargs["max_memory"]

            model = MambaLMHeadModel.from_pretrained(
                base_model,
@@ -683,10 +711,15 @@ def load_lora(model, cfg, inference=False):

    if cfg.lora_model_dir:
        LOG.debug("Loading pretained PEFT - LoRA")
+        model_kwargs: Any = {}
+        if cfg.lora_on_cpu:
+            model_kwargs["max_memory"] = {"cpu": "256GiB"}
+            model_kwargs["device_map"] = {"": "cpu"}
        model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
            is_trainable=(not inference),
+            **model_kwargs,
        )
    else:
        model = get_peft_model(model, lora_config)