wip tp
This commit is contained in:
@@ -1,10 +1,14 @@
|
|||||||
"""Benchmarking and measurement utilities"""
|
"""Benchmarking and measurement utilities"""
|
||||||
import functools
|
import functools
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
import pynvml
|
import pynvml
|
||||||
import torch
|
import torch
|
||||||
from pynvml.nvml import NVMLError
|
from pynvml.nvml import NVMLError
|
||||||
|
|
||||||
|
LOG = logging.getLogger("axolotl.utils.bench")
|
||||||
|
|
||||||
|
|
||||||
def check_cuda_device(default_value):
|
def check_cuda_device(default_value):
|
||||||
"""
|
"""
|
||||||
@@ -62,7 +66,14 @@ def gpu_memory_usage_smi(device=0):
|
|||||||
|
|
||||||
|
|
||||||
def log_gpu_memory_usage(log, msg, device):
|
def log_gpu_memory_usage(log, msg, device):
|
||||||
usage, cache, misc = gpu_memory_usage_all(device)
|
if not torch.cuda.is_available():
|
||||||
|
return (0, 0, 0)
|
||||||
|
|
||||||
|
try:
|
||||||
|
usage, cache, misc = gpu_memory_usage_all(device)
|
||||||
|
except ValueError as exc:
|
||||||
|
LOG.exception(exc)
|
||||||
|
return (0, 0, 0)
|
||||||
extras = []
|
extras = []
|
||||||
if cache > 0:
|
if cache > 0:
|
||||||
extras.append(f"+{cache:.03f}GB cache")
|
extras.append(f"+{cache:.03f}GB cache")
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ import math
|
|||||||
import os
|
import os
|
||||||
from typing import Optional, Tuple # noqa: F401
|
from typing import Optional, Tuple # noqa: F401
|
||||||
|
|
||||||
import accelerate
|
|
||||||
import bitsandbytes as bnb
|
import bitsandbytes as bnb
|
||||||
import tensor_parallel as tp
|
import tensor_parallel as tp
|
||||||
import torch
|
import torch
|
||||||
@@ -328,14 +327,21 @@ def load_model(
|
|||||||
base_model,
|
base_model,
|
||||||
trust_remote_code=cfg.trust_remote_code or False,
|
trust_remote_code=cfg.trust_remote_code or False,
|
||||||
)
|
)
|
||||||
with accelerate.init_empty_weights():
|
# with accelerate.init_empty_weights():
|
||||||
model = AutoModelForCausalLM.from_config(
|
# model = AutoModelForCausalLM.from_config(
|
||||||
config=config,
|
# config=config,
|
||||||
trust_remote_code=cfg.trust_remote_code or False,
|
# trust_remote_code=cfg.trust_remote_code or False,
|
||||||
).half()
|
# ).half()
|
||||||
model = tp.TensorParallelPreTrainedModel(
|
# model = tp.TensorParallelPreTrainedModel(
|
||||||
model,
|
# model,
|
||||||
)
|
# sharded=False,
|
||||||
|
# )
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
base_model,
|
||||||
|
config=config,
|
||||||
|
trust_remote_code=cfg.trust_remote_code or False,
|
||||||
|
).half()
|
||||||
|
model = tp.tensor_parallel(model, sharded=False)
|
||||||
else:
|
else:
|
||||||
config = AutoConfig.from_pretrained(
|
config = AutoConfig.from_pretrained(
|
||||||
base_model,
|
base_model,
|
||||||
@@ -386,15 +392,18 @@ def load_model(
|
|||||||
**model_kwargs,
|
**model_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
embeddings_len = (
|
try:
|
||||||
math.ceil(len(tokenizer) / 32) * 32
|
embeddings_len = (
|
||||||
if cfg.resize_token_embeddings_to_32x
|
math.ceil(len(tokenizer) / 32) * 32
|
||||||
else len(tokenizer)
|
if cfg.resize_token_embeddings_to_32x
|
||||||
)
|
else len(tokenizer)
|
||||||
if model.get_input_embeddings().num_embeddings < embeddings_len:
|
)
|
||||||
model.resize_token_embeddings(embeddings_len)
|
if model.get_input_embeddings().num_embeddings < embeddings_len:
|
||||||
else:
|
model.resize_token_embeddings(embeddings_len)
|
||||||
model.tie_weights()
|
else:
|
||||||
|
model.tie_weights()
|
||||||
|
except NotImplementedError:
|
||||||
|
LOG.warning("`resize_token_embeddings` not implemented on model")
|
||||||
|
|
||||||
if (
|
if (
|
||||||
hasattr(model.config, "max_position_embeddings")
|
hasattr(model.config, "max_position_embeddings")
|
||||||
@@ -497,7 +506,10 @@ def load_adapter(model, cfg, adapter, inference=False):
|
|||||||
if adapter is None:
|
if adapter is None:
|
||||||
return model, None
|
return model, None
|
||||||
if hasattr(model, "enable_input_require_grads"):
|
if hasattr(model, "enable_input_require_grads"):
|
||||||
model.enable_input_require_grads()
|
try:
|
||||||
|
model.enable_input_require_grads()
|
||||||
|
except NotImplementedError:
|
||||||
|
LOG.warning("enable_input_require_grads not implemented on model")
|
||||||
if adapter == "qlora" and cfg.tensor_parallel:
|
if adapter == "qlora" and cfg.tensor_parallel:
|
||||||
return load_tp_qlora(model)
|
return load_tp_qlora(model)
|
||||||
if adapter in ["lora", "qlora"]:
|
if adapter in ["lora", "qlora"]:
|
||||||
|
|||||||
Reference in New Issue
Block a user