Compare commits
8 Commits
kd-logprob
...
autogptq-t
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0026fcc3df | ||
|
|
b448c77148 | ||
|
|
c820d04669 | ||
|
|
588cd65a64 | ||
|
|
caa80e891d | ||
|
|
ac37753aa2 | ||
|
|
a29560004b | ||
|
|
1deb767fe8 |
10
.github/workflows/main.yml
vendored
10
.github/workflows/main.yml
vendored
@@ -23,11 +23,6 @@ jobs:
|
|||||||
python_version: "3.10"
|
python_version: "3.10"
|
||||||
pytorch: 2.0.1
|
pytorch: 2.0.1
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
- cuda: 118
|
|
||||||
cuda_version: 11.8.0
|
|
||||||
python_version: "3.9"
|
|
||||||
pytorch: 2.0.1
|
|
||||||
axolotl_extras: gptq
|
|
||||||
runs-on: self-hosted
|
runs-on: self-hosted
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
@@ -73,11 +68,6 @@ jobs:
|
|||||||
pytorch: 2.0.1
|
pytorch: 2.0.1
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
is_latest: true
|
is_latest: true
|
||||||
- cuda: 118
|
|
||||||
cuda_version: 11.8.0
|
|
||||||
python_version: "3.9"
|
|
||||||
pytorch: 2.0.1
|
|
||||||
axolotl_extras: gptq
|
|
||||||
runs-on: self-hosted
|
runs-on: self-hosted
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
|
|||||||
2
.github/workflows/tests.yml
vendored
2
.github/workflows/tests.yml
vendored
@@ -24,7 +24,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install -e .[peft]
|
pip install -e .
|
||||||
pip install -r requirements-tests.txt
|
pip install -r requirements-tests.txt
|
||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
|
|||||||
@@ -11,14 +11,13 @@ RUN apt-get update && \
|
|||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
RUN pip3 install "peft @ git+https://github.com/huggingface/peft.git@main"
|
|
||||||
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
|
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
|
||||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||||
RUN cd axolotl && \
|
RUN cd axolotl && \
|
||||||
if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||||
pip install -e .[flash-attn,$AXOLOTL_EXTRAS]; \
|
pip install -e .[flash-attn,gptq,$AXOLOTL_EXTRAS]; \
|
||||||
else \
|
else \
|
||||||
pip install -e .[flash-attn]; \
|
pip install -e .[flash-attn,gptq]; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# fix so that git fetch/pull from remote works
|
# fix so that git fetch/pull from remote works
|
||||||
|
|||||||
@@ -1,8 +0,0 @@
|
|||||||
# LLaMa 7B using LoRA
|
|
||||||
|
|
||||||
This is a good place to start for beginners. This will run on an NVIDIA RTX4090 with no other changes needed.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
accelerate launch scripts/finetune.py examples/gptq-lora-7b/config.yml
|
|
||||||
|
|
||||||
```
|
|
||||||
@@ -1,63 +0,0 @@
|
|||||||
base_model: Neko-Institute-of-Science/LLaMA-7B-4bit-128g
|
|
||||||
base_model_config: Neko-Institute-of-Science/LLaMA-7B-4bit-128g
|
|
||||||
model_type: LlamaForCausalLM
|
|
||||||
tokenizer_type: LlamaTokenizer
|
|
||||||
trust_remote_code:
|
|
||||||
load_in_8bit: true
|
|
||||||
gptq: true
|
|
||||||
datasets:
|
|
||||||
- path: vicgalle/alpaca-gpt4
|
|
||||||
type: alpaca
|
|
||||||
dataset_prepared_path: last_run_prepared
|
|
||||||
val_set_size: 0.02
|
|
||||||
adapter:
|
|
||||||
lora_model_dir:
|
|
||||||
sequence_len: 2048
|
|
||||||
max_packed_sequence_len:
|
|
||||||
lora_r: 8
|
|
||||||
lora_alpha: 16
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_modules:
|
|
||||||
- q_proj
|
|
||||||
- v_proj
|
|
||||||
lora_fan_in_fan_out: false
|
|
||||||
wandb_project: llama-7b-lora-int4
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_run_id:
|
|
||||||
wandb_log_model:
|
|
||||||
output_dir: ./llama-7b-lora-int4
|
|
||||||
gradient_accumulation_steps: 1
|
|
||||||
micro_batch_size: 1
|
|
||||||
num_epochs: 3
|
|
||||||
optimizer: adamw_bnb_8bit
|
|
||||||
torchdistx_path:
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0000002
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
fp16: true
|
|
||||||
bf16: false
|
|
||||||
tf32: true
|
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
|
||||||
local_rank:
|
|
||||||
logging_steps: 5
|
|
||||||
xformers_attention:
|
|
||||||
flash_attention:
|
|
||||||
gradient_checkpointing: true
|
|
||||||
gptq_groupsize: 128
|
|
||||||
gptq_model_v1: false
|
|
||||||
warmup_steps: 20
|
|
||||||
eval_steps: 110
|
|
||||||
save_steps: 660
|
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0001
|
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
tokens:
|
|
||||||
pad_token: "<pad>"
|
|
||||||
bos_token: "<s>"
|
|
||||||
eos_token: "</s>"
|
|
||||||
unk_token: "<unk>"
|
|
||||||
76
examples/llama-2/gptq-lora.yml
Normal file
76
examples/llama-2/gptq-lora.yml
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
base_model: TheBloke/Llama-2-7B-GPTQ
|
||||||
|
base_model_config: TheBloke/Llama-2-7B-GPTQ
|
||||||
|
is_llama_derived_model: false
|
||||||
|
gptq: true
|
||||||
|
gptq_bits: 4
|
||||||
|
model_type: AutoModelForCausalLM
|
||||||
|
tokenizer_type: LlamaTokenizer
|
||||||
|
tokenizer_use_fast: true
|
||||||
|
tokenizer_legacy: true
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
push_dataset_to_hub:
|
||||||
|
hf_use_auth_token: true
|
||||||
|
datasets:
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.01
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing:
|
||||||
|
lora_r: 8
|
||||||
|
lora_alpha: 32
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_modules:
|
||||||
|
- k_proj
|
||||||
|
- o_proj
|
||||||
|
- q_proj
|
||||||
|
- v_proj
|
||||||
|
lora_target_linear:
|
||||||
|
lora_fan_in_fan_out:
|
||||||
|
wandb_project:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_run_id:
|
||||||
|
wandb_log_model:
|
||||||
|
output_dir: ./model-out
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 3
|
||||||
|
optimizer: adamw_torch
|
||||||
|
adam_beta2: 0.95
|
||||||
|
adam_eps: 0.00001
|
||||||
|
max_grad_norm: 1.0
|
||||||
|
torchdistx_path:
|
||||||
|
lr_scheduler: cosine
|
||||||
|
lr_quadratic_warmup: true
|
||||||
|
learning_rate: 0.000017
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: false
|
||||||
|
fp16: false
|
||||||
|
float16: true
|
||||||
|
tf32: true
|
||||||
|
gradient_checkpointing: true
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention:
|
||||||
|
sdp_attention:
|
||||||
|
flash_optimum:
|
||||||
|
gptq_groupsize:
|
||||||
|
gptq_model_v1:
|
||||||
|
warmup_steps: 100
|
||||||
|
eval_steps:
|
||||||
|
save_steps:
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.1
|
||||||
|
special_tokens:
|
||||||
|
bos_token: "<s>"
|
||||||
|
eos_token: "</s>"
|
||||||
|
unk_token: "<unk>"
|
||||||
@@ -1,3 +1,7 @@
|
|||||||
|
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||||
|
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
||||||
|
torch==2.0.1
|
||||||
|
auto-gptq
|
||||||
packaging
|
packaging
|
||||||
peft @ git+https://github.com/huggingface/peft.git
|
peft @ git+https://github.com/huggingface/peft.git
|
||||||
transformers @ git+https://github.com/huggingface/transformers.git
|
transformers @ git+https://github.com/huggingface/transformers.git
|
||||||
|
|||||||
39
setup.py
39
setup.py
@@ -2,15 +2,27 @@
|
|||||||
|
|
||||||
from setuptools import find_packages, setup
|
from setuptools import find_packages, setup
|
||||||
|
|
||||||
install_requires = []
|
|
||||||
with open("./requirements.txt", encoding="utf-8") as requirements_file:
|
def parse_requirements():
|
||||||
# don't include peft yet until we check the int4
|
_install_requires = []
|
||||||
# need to manually install peft for now...
|
_dependency_links = []
|
||||||
reqs = [r.strip() for r in requirements_file.readlines() if "peft" not in r]
|
with open("./requirements.txt", encoding="utf-8") as requirements_file:
|
||||||
reqs = [r for r in reqs if "flash-attn" not in r]
|
lines = [
|
||||||
reqs = [r for r in reqs if r and r[0] != "#"]
|
r.strip() for r in requirements_file.readlines() if "auto-gptq" not in r
|
||||||
for r in reqs:
|
]
|
||||||
install_requires.append(r)
|
for line in lines:
|
||||||
|
if line.startswith("--extra-index-url"):
|
||||||
|
# Handle custom index URLs
|
||||||
|
_, url = line.split()
|
||||||
|
_dependency_links.append(url)
|
||||||
|
elif "flash-attn" not in line and line and line[0] != "#":
|
||||||
|
# Handle standard packages
|
||||||
|
_install_requires.append(line)
|
||||||
|
return _install_requires, _dependency_links
|
||||||
|
|
||||||
|
|
||||||
|
install_requires, dependency_links = parse_requirements()
|
||||||
|
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="axolotl",
|
name="axolotl",
|
||||||
@@ -19,12 +31,10 @@ setup(
|
|||||||
package_dir={"": "src"},
|
package_dir={"": "src"},
|
||||||
packages=find_packages(),
|
packages=find_packages(),
|
||||||
install_requires=install_requires,
|
install_requires=install_requires,
|
||||||
|
dependency_links=dependency_links,
|
||||||
extras_require={
|
extras_require={
|
||||||
"gptq": [
|
"gptq": [
|
||||||
"alpaca_lora_4bit @ git+https://github.com/winglian/alpaca_lora_4bit.git@setup_pip",
|
"auto-gptq",
|
||||||
],
|
|
||||||
"gptq_triton": [
|
|
||||||
"alpaca_lora_4bit[triton] @ git+https://github.com/winglian/alpaca_lora_4bit.git@setup_pip",
|
|
||||||
],
|
],
|
||||||
"flash-attn": [
|
"flash-attn": [
|
||||||
"flash-attn==2.0.8",
|
"flash-attn==2.0.8",
|
||||||
@@ -32,8 +42,5 @@ setup(
|
|||||||
"extras": [
|
"extras": [
|
||||||
"deepspeed",
|
"deepspeed",
|
||||||
],
|
],
|
||||||
"peft": [
|
|
||||||
"peft @ git+https://github.com/huggingface/peft.git",
|
|
||||||
],
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -97,9 +97,7 @@ def validate_config(cfg):
|
|||||||
"To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
|
"To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
|
||||||
)
|
)
|
||||||
if cfg.load_4bit:
|
if cfg.load_4bit:
|
||||||
raise ValueError(
|
raise ValueError("cfg.load_4bit parameter has been deprecated")
|
||||||
"cfg.load_4bit parameter has been deprecated and replaced by cfg.gptq"
|
|
||||||
)
|
|
||||||
|
|
||||||
if cfg.adapter == "qlora":
|
if cfg.adapter == "qlora":
|
||||||
if cfg.merge_lora:
|
if cfg.merge_lora:
|
||||||
|
|||||||
@@ -4,19 +4,19 @@
|
|||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
|
||||||
from typing import Optional, Tuple # noqa: F401
|
from typing import Optional, Tuple # noqa: F401
|
||||||
|
|
||||||
import bitsandbytes as bnb
|
import bitsandbytes as bnb
|
||||||
import torch
|
import torch
|
||||||
import transformers
|
import transformers
|
||||||
from optimum.bettertransformer import BetterTransformer
|
from optimum.bettertransformer import BetterTransformer
|
||||||
from peft import PeftConfig
|
from peft import PeftConfig, prepare_model_for_kbit_training
|
||||||
from transformers import ( # noqa: F401
|
from transformers import ( # noqa: F401
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoModelForCausalLM,
|
AutoModelForCausalLM,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
BitsAndBytesConfig,
|
BitsAndBytesConfig,
|
||||||
|
GPTQConfig,
|
||||||
LlamaConfig,
|
LlamaConfig,
|
||||||
PreTrainedModel,
|
PreTrainedModel,
|
||||||
PreTrainedTokenizerBase,
|
PreTrainedTokenizerBase,
|
||||||
@@ -155,32 +155,15 @@ def load_model(
|
|||||||
LOG.info("patching _expand_mask")
|
LOG.info("patching _expand_mask")
|
||||||
hijack_expand_mask()
|
hijack_expand_mask()
|
||||||
|
|
||||||
try:
|
|
||||||
if cfg.gptq:
|
|
||||||
from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
|
|
||||||
replace_peft_model_with_int4_lora_model,
|
|
||||||
)
|
|
||||||
|
|
||||||
replace_peft_model_with_int4_lora_model()
|
|
||||||
except Exception as err:
|
|
||||||
LOG.exception(err)
|
|
||||||
raise err
|
|
||||||
|
|
||||||
if not cfg.gptq and (
|
|
||||||
(cfg.adapter == "lora" and load_in_8bit)
|
|
||||||
or (cfg.adapter == "qlora" and cfg.load_in_4bit)
|
|
||||||
):
|
|
||||||
try:
|
|
||||||
from peft import prepare_model_for_kbit_training
|
|
||||||
except ImportError:
|
|
||||||
# For backward compatibility
|
|
||||||
from peft import (
|
|
||||||
prepare_model_for_int8_training as prepare_model_for_kbit_training,
|
|
||||||
)
|
|
||||||
|
|
||||||
model_kwargs = {}
|
model_kwargs = {}
|
||||||
if cfg.model_revision:
|
if cfg.model_revision:
|
||||||
model_kwargs["revision"] = cfg.model_revision
|
model_kwargs["revision"] = cfg.model_revision
|
||||||
|
if cfg.gptq:
|
||||||
|
# TODO we should figure out how to read the model's config.json first
|
||||||
|
model_kwargs["quantization_config"] = GPTQConfig(
|
||||||
|
bits=cfg.gptq_bits,
|
||||||
|
disable_exllama=True,
|
||||||
|
)
|
||||||
if cfg.adapter == "qlora" and cfg.load_in_4bit:
|
if cfg.adapter == "qlora" and cfg.load_in_4bit:
|
||||||
model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
||||||
load_in_4bit=True,
|
load_in_4bit=True,
|
||||||
@@ -191,45 +174,7 @@ def load_model(
|
|||||||
bnb_4bit_quant_type="nf4",
|
bnb_4bit_quant_type="nf4",
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
if cfg.gptq and cfg.is_llama_derived_model:
|
if cfg.is_llama_derived_model and not cfg.trust_remote_code and not cfg.gptq:
|
||||||
from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram
|
|
||||||
from huggingface_hub import snapshot_download
|
|
||||||
|
|
||||||
try:
|
|
||||||
snapshot_download_kwargs = {}
|
|
||||||
if cfg.base_model_ignore_patterns:
|
|
||||||
snapshot_download_kwargs[
|
|
||||||
"ignore_patterns"
|
|
||||||
] = cfg.base_model_ignore_patterns
|
|
||||||
cache_model_path = Path(
|
|
||||||
snapshot_download(base_model, **snapshot_download_kwargs)
|
|
||||||
)
|
|
||||||
files = (
|
|
||||||
list(cache_model_path.glob("*.pt"))
|
|
||||||
+ list(cache_model_path.glob("*.safetensors"))
|
|
||||||
+ list(cache_model_path.glob("*.bin"))
|
|
||||||
)
|
|
||||||
if len(files) > 0:
|
|
||||||
model_path = str(files[0])
|
|
||||||
else:
|
|
||||||
LOG.warning(
|
|
||||||
"unable to find a cached model file, this will likely fail..."
|
|
||||||
)
|
|
||||||
model_path = str(cache_model_path)
|
|
||||||
except Exception: # pylint: disable=broad-exception-caught
|
|
||||||
model_path = cfg.base_model
|
|
||||||
model, _ = load_llama_model_4bit_low_ram(
|
|
||||||
base_model_config if base_model_config else base_model,
|
|
||||||
model_path,
|
|
||||||
device_map=cfg.device_map,
|
|
||||||
half=cfg.fp16,
|
|
||||||
groupsize=cfg.gptq_groupsize if cfg.gptq_groupsize else -1,
|
|
||||||
is_v1_model=cfg.gptq_model_v1
|
|
||||||
if cfg.gptq_model_v1 is not None
|
|
||||||
else True,
|
|
||||||
)
|
|
||||||
load_in_8bit = False
|
|
||||||
elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
|
|
||||||
from transformers import LlamaForCausalLM
|
from transformers import LlamaForCausalLM
|
||||||
|
|
||||||
config_kwargs = {}
|
config_kwargs = {}
|
||||||
@@ -275,15 +220,24 @@ def load_model(
|
|||||||
# )
|
# )
|
||||||
# model.train() # sets to train instead of eval mode
|
# model.train() # sets to train instead of eval mode
|
||||||
elif model_type and not cfg.trust_remote_code:
|
elif model_type and not cfg.trust_remote_code:
|
||||||
model = getattr(transformers, model_type).from_pretrained(
|
if cfg.gptq:
|
||||||
base_model,
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
device_map=cfg.device_map,
|
base_model,
|
||||||
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
device_map=cfg.device_map,
|
||||||
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
torch_dtype=cfg.torch_dtype,
|
||||||
torch_dtype=cfg.torch_dtype,
|
trust_remote_code=cfg.trust_remote_code or False,
|
||||||
trust_remote_code=cfg.trust_remote_code or False,
|
**model_kwargs,
|
||||||
**model_kwargs,
|
)
|
||||||
)
|
else:
|
||||||
|
model = getattr(transformers, model_type).from_pretrained(
|
||||||
|
base_model,
|
||||||
|
device_map=cfg.device_map,
|
||||||
|
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
||||||
|
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
||||||
|
torch_dtype=cfg.torch_dtype,
|
||||||
|
trust_remote_code=cfg.trust_remote_code or False,
|
||||||
|
**model_kwargs,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
config = AutoConfig.from_pretrained(
|
config = AutoConfig.from_pretrained(
|
||||||
base_model,
|
base_model,
|
||||||
@@ -359,11 +313,12 @@ def load_model(
|
|||||||
module.to(torch.float32)
|
module.to(torch.float32)
|
||||||
|
|
||||||
needs_fa2_dtype = cfg.adapter or cfg.fsdp
|
needs_fa2_dtype = cfg.adapter or cfg.fsdp
|
||||||
if not cfg.gptq and (
|
if (cfg.adapter == "lora" and load_in_8bit) or (
|
||||||
(cfg.adapter == "lora" and load_in_8bit)
|
cfg.adapter == "qlora" and cfg.load_in_4bit
|
||||||
or (cfg.adapter == "qlora" and cfg.load_in_4bit)
|
|
||||||
):
|
):
|
||||||
LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
|
LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
|
||||||
|
if cfg.gradient_checkpointing:
|
||||||
|
model.gradient_checkpointing_enable()
|
||||||
model = prepare_model_for_kbit_training(
|
model = prepare_model_for_kbit_training(
|
||||||
model, use_gradient_checkpointing=cfg.gradient_checkpointing
|
model, use_gradient_checkpointing=cfg.gradient_checkpointing
|
||||||
)
|
)
|
||||||
@@ -385,22 +340,10 @@ def load_model(
|
|||||||
if cfg.ddp and not load_in_8bit:
|
if cfg.ddp and not load_in_8bit:
|
||||||
model.to(f"cuda:{cfg.local_rank}")
|
model.to(f"cuda:{cfg.local_rank}")
|
||||||
|
|
||||||
if cfg.gptq:
|
|
||||||
# Scales to half
|
|
||||||
LOG.info("Fitting 4bit scales and zeros to half")
|
|
||||||
for _, module in model.named_modules():
|
|
||||||
if "Autograd4bitQuantLinear" in str(type(module)) or "Linear4bitLt" in str(
|
|
||||||
type(module)
|
|
||||||
):
|
|
||||||
if hasattr(module, "is_v1_model") and module.is_v1_model:
|
|
||||||
module.zeros = module.zeros.half()
|
|
||||||
module.scales = module.scales.half()
|
|
||||||
module.bias = module.bias.half()
|
|
||||||
|
|
||||||
if (
|
if (
|
||||||
torch.cuda.device_count() > 1
|
torch.cuda.device_count() > 1
|
||||||
and int(os.getenv("WORLD_SIZE", "1")) > 1
|
and int(os.getenv("WORLD_SIZE", "1")) > 1
|
||||||
and (cfg.gptq or cfg.load_in_4bit)
|
and (cfg.load_in_4bit)
|
||||||
):
|
):
|
||||||
# llama is PROBABLY model parallelizable, but the default isn't that it is
|
# llama is PROBABLY model parallelizable, but the default isn't that it is
|
||||||
# so let's only set it for the 4bit, see
|
# so let's only set it for the 4bit, see
|
||||||
|
|||||||
@@ -447,23 +447,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
training_arguments_kwargs["seed"] = cfg.seed
|
training_arguments_kwargs["seed"] = cfg.seed
|
||||||
|
|
||||||
if cfg.gradient_checkpointing:
|
if cfg.gradient_checkpointing:
|
||||||
if cfg.gptq:
|
training_arguments_kwargs["gradient_checkpointing"] = cfg.gradient_checkpointing
|
||||||
from alpaca_lora_4bit.gradient_checkpointing import (
|
|
||||||
apply_gradient_checkpointing,
|
|
||||||
)
|
|
||||||
|
|
||||||
gradient_checkpointing_ratio = (
|
|
||||||
cfg.gradient_checkpointing_ratio
|
|
||||||
if cfg.gradient_checkpointing_ratio
|
|
||||||
else 1.0
|
|
||||||
)
|
|
||||||
apply_gradient_checkpointing(
|
|
||||||
model, checkpoint_ratio=gradient_checkpointing_ratio
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
training_arguments_kwargs[
|
|
||||||
"gradient_checkpointing"
|
|
||||||
] = cfg.gradient_checkpointing
|
|
||||||
if cfg.fsdp:
|
if cfg.fsdp:
|
||||||
training_arguments_kwargs["fsdp"] = cfg.fsdp
|
training_arguments_kwargs["fsdp"] = cfg.fsdp
|
||||||
if cfg.fsdp_config:
|
if cfg.fsdp_config:
|
||||||
|
|||||||
Reference in New Issue
Block a user