This commit is contained in:
Wing Lian
2023-09-08 01:58:15 -04:00
parent 65f3a4f703
commit fb8ee37ca6
2 changed files with 43 additions and 20 deletions

View File

@@ -1,10 +1,14 @@
"""Benchmarking and measurement utilities""" """Benchmarking and measurement utilities"""
import functools import functools
import logging
import pynvml import pynvml
import torch import torch
from pynvml.nvml import NVMLError from pynvml.nvml import NVMLError
LOG = logging.getLogger("axolotl.utils.bench")
def check_cuda_device(default_value): def check_cuda_device(default_value):
""" """
@@ -62,7 +66,14 @@ def gpu_memory_usage_smi(device=0):
def log_gpu_memory_usage(log, msg, device): def log_gpu_memory_usage(log, msg, device):
usage, cache, misc = gpu_memory_usage_all(device) if not torch.cuda.is_available():
return (0, 0, 0)
try:
usage, cache, misc = gpu_memory_usage_all(device)
except ValueError as exc:
LOG.exception(exc)
return (0, 0, 0)
extras = [] extras = []
if cache > 0: if cache > 0:
extras.append(f"+{cache:.03f}GB cache") extras.append(f"+{cache:.03f}GB cache")

View File

@@ -4,7 +4,6 @@ import math
import os import os
from typing import Optional, Tuple # noqa: F401 from typing import Optional, Tuple # noqa: F401
import accelerate
import bitsandbytes as bnb import bitsandbytes as bnb
import tensor_parallel as tp import tensor_parallel as tp
import torch import torch
@@ -328,14 +327,21 @@ def load_model(
base_model, base_model,
trust_remote_code=cfg.trust_remote_code or False, trust_remote_code=cfg.trust_remote_code or False,
) )
with accelerate.init_empty_weights(): # with accelerate.init_empty_weights():
model = AutoModelForCausalLM.from_config( # model = AutoModelForCausalLM.from_config(
config=config, # config=config,
trust_remote_code=cfg.trust_remote_code or False, # trust_remote_code=cfg.trust_remote_code or False,
).half() # ).half()
model = tp.TensorParallelPreTrainedModel( # model = tp.TensorParallelPreTrainedModel(
model, # model,
) # sharded=False,
# )
model = AutoModelForCausalLM.from_pretrained(
base_model,
config=config,
trust_remote_code=cfg.trust_remote_code or False,
).half()
model = tp.tensor_parallel(model, sharded=False)
else: else:
config = AutoConfig.from_pretrained( config = AutoConfig.from_pretrained(
base_model, base_model,
@@ -386,15 +392,18 @@ def load_model(
**model_kwargs, **model_kwargs,
) )
embeddings_len = ( try:
math.ceil(len(tokenizer) / 32) * 32 embeddings_len = (
if cfg.resize_token_embeddings_to_32x math.ceil(len(tokenizer) / 32) * 32
else len(tokenizer) if cfg.resize_token_embeddings_to_32x
) else len(tokenizer)
if model.get_input_embeddings().num_embeddings < embeddings_len: )
model.resize_token_embeddings(embeddings_len) if model.get_input_embeddings().num_embeddings < embeddings_len:
else: model.resize_token_embeddings(embeddings_len)
model.tie_weights() else:
model.tie_weights()
except NotImplementedError:
LOG.warning("`resize_token_embeddings` not implemented on model")
if ( if (
hasattr(model.config, "max_position_embeddings") hasattr(model.config, "max_position_embeddings")
@@ -497,7 +506,10 @@ def load_adapter(model, cfg, adapter, inference=False):
if adapter is None: if adapter is None:
return model, None return model, None
if hasattr(model, "enable_input_require_grads"): if hasattr(model, "enable_input_require_grads"):
model.enable_input_require_grads() try:
model.enable_input_require_grads()
except NotImplementedError:
LOG.warning("enable_input_require_grads not implemented on model")
if adapter == "qlora" and cfg.tensor_parallel: if adapter == "qlora" and cfg.tensor_parallel:
return load_tp_qlora(model) return load_tp_qlora(model)
if adapter in ["lora", "qlora"]: if adapter in ["lora", "qlora"]: