Compare commits

..

1 Commits

Author SHA1 Message Date
Wing Lian
db86e32cf4 enable hf transfer and add unzip to image 2023-10-29 04:53:14 -04:00
8 changed files with 22 additions and 116 deletions

View File

@@ -8,6 +8,7 @@ ENV BNB_CUDA_VERSION=$CUDA
ARG PYTORCH_VERSION="2.0.1" ARG PYTORCH_VERSION="2.0.1"
ENV PYTORCH_VERSION=$PYTORCH_VERSION ENV PYTORCH_VERSION=$PYTORCH_VERSION
ENV HF_HUB_ENABLE_HF_TRANSFER=1
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y vim curl apt-get install -y vim curl

View File

@@ -14,7 +14,7 @@ ARG CUDA="118"
ENV PYTHON_VERSION=$PYTHON_VERSION ENV PYTHON_VERSION=$PYTHON_VERSION
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev && rm -rf /var/lib/apt/lists/* \ && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev unzip && rm -rf /var/lib/apt/lists/* \
&& wget \ && wget \
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
&& mkdir /root/.conda \ && mkdir /root/.conda \

View File

@@ -12,7 +12,3 @@ This usually happens when you run out of system RAM.
> Exitcode -7 while using deepspeed > Exitcode -7 while using deepspeed
Try upgrading deepspeed w: `pip install -U deepspeed` Try upgrading deepspeed w: `pip install -U deepspeed`
> AttributeError: 'DummyOptim' object has no attribute 'step'
You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.

View File

@@ -31,4 +31,3 @@ scikit-learn==1.2.2
pynvml pynvml
art art
fschat==0.2.29 fschat==0.2.29
tensor_parallel

View File

@@ -14,7 +14,6 @@ from functools import partial
from pathlib import Path from pathlib import Path
from typing import Optional, Union from typing import Optional, Union
import tensor_parallel as tp
import torch import torch
import transformers import transformers
from datasets import Dataset from datasets import Dataset
@@ -34,7 +33,6 @@ from axolotl.utils.callbacks import (
) )
from axolotl.utils.collators import DataCollatorForSeq2Seq from axolotl.utils.collators import DataCollatorForSeq2Seq
from axolotl.utils.dataloader import MultipackDistributedDataloader from axolotl.utils.dataloader import MultipackDistributedDataloader
from axolotl.utils.distributed import is_distributed
from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup
try: try:
@@ -104,9 +102,6 @@ class AxolotlTrainingArguments(TrainingArguments):
bench_source_max_len: int = field( bench_source_max_len: int = field(
default=2048, metadata={"help": "Maximum source sequence length for bench."} default=2048, metadata={"help": "Maximum source sequence length for bench."}
) )
tensor_parallel: bool = field(
default=False, metadata={"help": "Use tensor parallelism to train"}
)
class AxolotlTrainer(Trainer): class AxolotlTrainer(Trainer):
@@ -251,14 +246,6 @@ class AxolotlTrainer(Trainer):
# return (loss, outputs) if return_outputs else loss # return (loss, outputs) if return_outputs else loss
return super().compute_loss(model, inputs, return_outputs=return_outputs) return super().compute_loss(model, inputs, return_outputs=return_outputs)
def _wrap_model(self, model, training=True, dataloader=None):
if self.args.tensor_parallel:
model = tp.tensor_parallel(model, distributed=is_distributed())
model.hf_device_map = tp.infer_sharded_device_map(model)
else:
model = super()._wrap_model(model, training=training, dataloader=dataloader)
return model
class OneCycleLRSchedulerTrainer(AxolotlTrainer): class OneCycleLRSchedulerTrainer(AxolotlTrainer):
""" """
@@ -384,10 +371,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
return trainer_kwargs, trainer_cls return trainer_kwargs, trainer_cls
def hook_post_create_trainer(self, trainer): def hook_post_create_trainer(self, trainer):
if self.cfg.tensor_parallel: # TODO
trainer.model = trainer.accelerator.prepare_model(
trainer.model, device_placement=True
)
return trainer return trainer
def get_callbacks(self): def get_callbacks(self):
@@ -631,8 +615,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
] = self.cfg.micro_batch_size ] = self.cfg.micro_batch_size
training_arguments_kwargs["relora_steps"] = self.cfg.relora_steps training_arguments_kwargs["relora_steps"] = self.cfg.relora_steps
training_arguments_kwargs["relora_warmup_steps"] = self.cfg.relora_warmup_steps training_arguments_kwargs["relora_warmup_steps"] = self.cfg.relora_warmup_steps
training_arguments_kwargs["tensor_parallel"] = self.cfg.tensor_parallel is True
training_arguments_kwargs = self.hook_pre_create_training_args( training_arguments_kwargs = self.hook_pre_create_training_args(
training_arguments_kwargs training_arguments_kwargs
) )

View File

@@ -1,13 +1,10 @@
"""Benchmarking and measurement utilities""" """Benchmarking and measurement utilities"""
import functools import functools
import logging
import pynvml import pynvml
import torch import torch
from pynvml.nvml import NVMLError from pynvml.nvml import NVMLError
LOG = logging.getLogger("axolotl.utils.bench")
def check_cuda_device(default_value): def check_cuda_device(default_value):
""" """
@@ -65,14 +62,7 @@ def gpu_memory_usage_smi(device=0):
def log_gpu_memory_usage(log, msg, device): def log_gpu_memory_usage(log, msg, device):
if not torch.cuda.is_available(): usage, cache, misc = gpu_memory_usage_all(device)
return (0, 0, 0)
try:
usage, cache, misc = gpu_memory_usage_all(device)
except ValueError as exc:
LOG.exception(exc)
return (0, 0, 0)
extras = [] extras = []
if cache > 0: if cache > 0:
extras.append(f"+{cache:.03f}GB cache") extras.append(f"+{cache:.03f}GB cache")

View File

@@ -369,10 +369,6 @@ def validate_config(cfg):
"If you want to full finetune, please turn off load_in_8bit and load_in_4bit." "If you want to full finetune, please turn off load_in_8bit and load_in_4bit."
) )
if cfg.tensor_parallel and cfg.gradient_checkpointing:
raise ValueError(
"TensorParallelPreTrainedModel does not support gradient checkpointing"
)
# TODO # TODO
# MPT 7b # MPT 7b
# https://github.com/facebookresearch/bitsandbytes/issues/25 # https://github.com/facebookresearch/bitsandbytes/issues/25

View File

@@ -7,7 +7,6 @@ from typing import Optional, Tuple # noqa: F401
import bitsandbytes as bnb import bitsandbytes as bnb
import torch import torch
import transformers import transformers
import transformers.utils.bitsandbytes
from optimum.bettertransformer import BetterTransformer from optimum.bettertransformer import BetterTransformer
from peft import PeftConfig, prepare_model_for_kbit_training from peft import PeftConfig, prepare_model_for_kbit_training
from peft.tuners.lora import QuantLinear from peft.tuners.lora import QuantLinear
@@ -73,6 +72,11 @@ def load_tokenizer(cfg):
# set a pad_token, but use eos_token so we don't add a new token # set a pad_token, but use eos_token so we don't add a new token
tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast": if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
tokenizer.add_special_tokens({"pad_token": "[PAD]"}) tokenizer.add_special_tokens({"pad_token": "[PAD]"})
os.environ["TOKENIZERS_PARALLELISM"] = "false" os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -94,11 +98,6 @@ def load_tokenizer(cfg):
] ]
) )
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
return tokenizer return tokenizer
@@ -222,7 +221,7 @@ def load_model(
load_in_4bit=True, load_in_4bit=True,
llm_int8_threshold=6.0, llm_int8_threshold=6.0,
llm_int8_has_fp16_weight=False, llm_int8_has_fp16_weight=False,
bnb_4bit_compute_dtype=torch.float16, bnb_4bit_compute_dtype=cfg.torch_dtype,
bnb_4bit_use_double_quant=True, bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4", bnb_4bit_quant_type="nf4",
) )
@@ -236,12 +235,7 @@ def load_model(
model_kwargs["use_flash_attention_2"] = True model_kwargs["use_flash_attention_2"] = True
try: try:
if ( if cfg.is_llama_derived_model and not cfg.trust_remote_code and not cfg.gptq:
cfg.is_llama_derived_model
and not cfg.trust_remote_code
and not cfg.gptq
and not cfg.tensor_parallel
):
from transformers import LlamaForCausalLM from transformers import LlamaForCausalLM
config_kwargs = {} config_kwargs = {}
@@ -307,7 +301,7 @@ def load_model(
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None, load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
**model_kwargs, **model_kwargs,
) )
elif model_type and not cfg.trust_remote_code and not cfg.tensor_parallel: elif model_type and not cfg.trust_remote_code:
if cfg.gptq: if cfg.gptq:
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
base_model, base_model,
@@ -322,17 +316,6 @@ def load_model(
trust_remote_code=cfg.trust_remote_code or False, trust_remote_code=cfg.trust_remote_code or False,
**model_kwargs, **model_kwargs,
) )
elif cfg.tensor_parallel:
model_kwargs.pop("device_map")
model = AutoModelForCausalLM.from_pretrained(
base_model,
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
low_cpu_mem_usage=True,
offload_state_dict=True,
trust_remote_code=cfg.trust_remote_code or False,
**model_kwargs,
)
else: else:
config = AutoConfig.from_pretrained( config = AutoConfig.from_pretrained(
base_model, base_model,
@@ -383,18 +366,15 @@ def load_model(
**model_kwargs, **model_kwargs,
) )
try: embeddings_len = (
embeddings_len = ( math.ceil(len(tokenizer) / 32) * 32
math.ceil(len(tokenizer) / 32) * 32 if cfg.resize_token_embeddings_to_32x
if cfg.resize_token_embeddings_to_32x else len(tokenizer)
else len(tokenizer) )
) if model.get_input_embeddings().num_embeddings < embeddings_len:
if model.get_input_embeddings().num_embeddings < embeddings_len: model.resize_token_embeddings(embeddings_len)
model.resize_token_embeddings(embeddings_len) else:
else: model.tie_weights()
model.tie_weights()
except NotImplementedError:
LOG.warning("`resize_token_embeddings` not implemented on model")
if ( if (
hasattr(model.config, "max_position_embeddings") hasattr(model.config, "max_position_embeddings")
@@ -406,20 +386,6 @@ def load_model(
) )
model.config.max_position_embeddings = cfg.sequence_len model.config.max_position_embeddings = cfg.sequence_len
if (
hasattr(model.config, "bos_token_id")
and model.config.bos_token_id
and model.config.bos_token_id != tokenizer.bos_token_id
):
model.config.bos_token_id = tokenizer.bos_token_id
if (
hasattr(model.config, "eos_token_id")
and model.config.eos_token_id
and model.config.eos_token_id != tokenizer.eos_token_id
):
model.config.eos_token_id = tokenizer.eos_token_id
if model.device.type == "cuda": if model.device.type == "cuda":
log_gpu_memory_usage(LOG, "after model load", model.device) log_gpu_memory_usage(LOG, "after model load", model.device)
@@ -497,12 +463,7 @@ def load_adapter(model, cfg, adapter, inference=False):
if adapter is None: if adapter is None:
return model, None return model, None
if hasattr(model, "enable_input_require_grads"): if hasattr(model, "enable_input_require_grads"):
try: model.enable_input_require_grads()
model.enable_input_require_grads()
except NotImplementedError:
LOG.warning("enable_input_require_grads not implemented on model")
if adapter == "qlora" and cfg.tensor_parallel:
model, _ = load_tp_qlora(model)
if adapter in ["lora", "qlora"]: if adapter in ["lora", "qlora"]:
return load_lora(model, cfg, inference=inference) return load_lora(model, cfg, inference=inference)
if adapter == "llama-adapter": if adapter == "llama-adapter":
@@ -554,25 +515,6 @@ def find_all_linear_names(model):
return list(lora_module_names) return list(lora_module_names)
def load_tp_qlora(model):
from transformers.utils.bitsandbytes import replace_with_bnb_linear
model = replace_with_bnb_linear(
model,
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
llm_int8_threshold=6.0,
llm_int8_has_fp16_weight=False,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
),
)
model.is_loaded_in_4bit = True
return model, None
def load_lora(model, cfg, inference=False): def load_lora(model, cfg, inference=False):
# type: (PreTrainedModel, DictDefault, bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]] # type: (PreTrainedModel, DictDefault, bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]]