This commit is contained in:
Wing Lian
2023-09-08 01:58:15 -04:00
parent 65f3a4f703
commit fb8ee37ca6
2 changed files with 43 additions and 20 deletions

View File

@@ -1,10 +1,14 @@
"""Benchmarking and measurement utilities"""
import functools
import logging
import pynvml
import torch
from pynvml.nvml import NVMLError
LOG = logging.getLogger("axolotl.utils.bench")
def check_cuda_device(default_value):
"""
@@ -62,7 +66,14 @@ def gpu_memory_usage_smi(device=0):
def log_gpu_memory_usage(log, msg, device):
usage, cache, misc = gpu_memory_usage_all(device)
if not torch.cuda.is_available():
return (0, 0, 0)
try:
usage, cache, misc = gpu_memory_usage_all(device)
except ValueError as exc:
LOG.exception(exc)
return (0, 0, 0)
extras = []
if cache > 0:
extras.append(f"+{cache:.03f}GB cache")

View File

@@ -4,7 +4,6 @@ import math
import os
from typing import Optional, Tuple # noqa: F401
import accelerate
import bitsandbytes as bnb
import tensor_parallel as tp
import torch
@@ -328,14 +327,21 @@ def load_model(
base_model,
trust_remote_code=cfg.trust_remote_code or False,
)
with accelerate.init_empty_weights():
model = AutoModelForCausalLM.from_config(
config=config,
trust_remote_code=cfg.trust_remote_code or False,
).half()
model = tp.TensorParallelPreTrainedModel(
model,
)
# with accelerate.init_empty_weights():
# model = AutoModelForCausalLM.from_config(
# config=config,
# trust_remote_code=cfg.trust_remote_code or False,
# ).half()
# model = tp.TensorParallelPreTrainedModel(
# model,
# sharded=False,
# )
model = AutoModelForCausalLM.from_pretrained(
base_model,
config=config,
trust_remote_code=cfg.trust_remote_code or False,
).half()
model = tp.tensor_parallel(model, sharded=False)
else:
config = AutoConfig.from_pretrained(
base_model,
@@ -386,15 +392,18 @@ def load_model(
**model_kwargs,
)
embeddings_len = (
math.ceil(len(tokenizer) / 32) * 32
if cfg.resize_token_embeddings_to_32x
else len(tokenizer)
)
if model.get_input_embeddings().num_embeddings < embeddings_len:
model.resize_token_embeddings(embeddings_len)
else:
model.tie_weights()
try:
embeddings_len = (
math.ceil(len(tokenizer) / 32) * 32
if cfg.resize_token_embeddings_to_32x
else len(tokenizer)
)
if model.get_input_embeddings().num_embeddings < embeddings_len:
model.resize_token_embeddings(embeddings_len)
else:
model.tie_weights()
except NotImplementedError:
LOG.warning("`resize_token_embeddings` not implemented on model")
if (
hasattr(model.config, "max_position_embeddings")
@@ -497,7 +506,10 @@ def load_adapter(model, cfg, adapter, inference=False):
if adapter is None:
return model, None
if hasattr(model, "enable_input_require_grads"):
model.enable_input_require_grads()
try:
model.enable_input_require_grads()
except NotImplementedError:
LOG.warning("enable_input_require_grads not implemented on model")
if adapter == "qlora" and cfg.tensor_parallel:
return load_tp_qlora(model)
if adapter in ["lora", "qlora"]: