diff --git a/src/axolotl/utils/bench.py b/src/axolotl/utils/bench.py index 40be0d9ac..2111fe819 100644 --- a/src/axolotl/utils/bench.py +++ b/src/axolotl/utils/bench.py @@ -1,10 +1,14 @@ """Benchmarking and measurement utilities""" import functools +import logging + import pynvml import torch from pynvml.nvml import NVMLError +LOG = logging.getLogger("axolotl.utils.bench") + def check_cuda_device(default_value): """ @@ -62,7 +66,14 @@ def gpu_memory_usage_smi(device=0): def log_gpu_memory_usage(log, msg, device): - usage, cache, misc = gpu_memory_usage_all(device) + if not torch.cuda.is_available(): + return (0, 0, 0) + + try: + usage, cache, misc = gpu_memory_usage_all(device) + except ValueError as exc: + LOG.exception(exc) + return (0, 0, 0) extras = [] if cache > 0: extras.append(f"+{cache:.03f}GB cache") diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index f61b27a90..988ed29ba 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -4,7 +4,6 @@ import math import os from typing import Optional, Tuple # noqa: F401 -import accelerate import bitsandbytes as bnb import tensor_parallel as tp import torch @@ -328,14 +327,21 @@ def load_model( base_model, trust_remote_code=cfg.trust_remote_code or False, ) - with accelerate.init_empty_weights(): - model = AutoModelForCausalLM.from_config( - config=config, - trust_remote_code=cfg.trust_remote_code or False, - ).half() - model = tp.TensorParallelPreTrainedModel( - model, - ) + # with accelerate.init_empty_weights(): + # model = AutoModelForCausalLM.from_config( + # config=config, + # trust_remote_code=cfg.trust_remote_code or False, + # ).half() + # model = tp.TensorParallelPreTrainedModel( + # model, + # sharded=False, + # ) + model = AutoModelForCausalLM.from_pretrained( + base_model, + config=config, + trust_remote_code=cfg.trust_remote_code or False, + ).half() + model = tp.tensor_parallel(model, sharded=False) else: config = AutoConfig.from_pretrained( base_model, @@ -386,15 +392,18 @@ def load_model( **model_kwargs, ) - embeddings_len = ( - math.ceil(len(tokenizer) / 32) * 32 - if cfg.resize_token_embeddings_to_32x - else len(tokenizer) - ) - if model.get_input_embeddings().num_embeddings < embeddings_len: - model.resize_token_embeddings(embeddings_len) - else: - model.tie_weights() + try: + embeddings_len = ( + math.ceil(len(tokenizer) / 32) * 32 + if cfg.resize_token_embeddings_to_32x + else len(tokenizer) + ) + if model.get_input_embeddings().num_embeddings < embeddings_len: + model.resize_token_embeddings(embeddings_len) + else: + model.tie_weights() + except NotImplementedError: + LOG.warning("`resize_token_embeddings` not implemented on model") if ( hasattr(model.config, "max_position_embeddings") @@ -497,7 +506,10 @@ def load_adapter(model, cfg, adapter, inference=False): if adapter is None: return model, None if hasattr(model, "enable_input_require_grads"): - model.enable_input_require_grads() + try: + model.enable_input_require_grads() + except NotImplementedError: + LOG.warning("enable_input_require_grads not implemented on model") if adapter == "qlora" and cfg.tensor_parallel: return load_tp_qlora(model) if adapter in ["lora", "qlora"]: