add float16 docs and tweak typehints
This commit is contained in:
@@ -264,6 +264,8 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
|
|||||||
bf16: true # require >=ampere
|
bf16: true # require >=ampere
|
||||||
fp16: true
|
fp16: true
|
||||||
tf32: true # require >=ampere
|
tf32: true # require >=ampere
|
||||||
|
bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP (automatic mixed precision)
|
||||||
|
float16: true # use instead of fp16 when you don't want AMP (automatic mixed precision)
|
||||||
```
|
```
|
||||||
Note: Repo does not do 4-bit quantization.
|
Note: Repo does not do 4-bit quantization.
|
||||||
|
|
||||||
@@ -522,6 +524,12 @@ Add below flag to train command above
|
|||||||
--merge_lora --lora_model_dir="./completed-model" --load_in_8bit=False --load_in_4bit=False
|
--merge_lora --lora_model_dir="./completed-model" --load_in_8bit=False --load_in_4bit=False
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If you run out of CUDA memory, you can try to merge in system RAM by hiding the GPUs from the process:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CUDA_VISIBLE_DEVICES="" python3 scripts/finetune.py ...
|
||||||
|
```
|
||||||
|
|
||||||
## Common Errors 🧰
|
## Common Errors 🧰
|
||||||
|
|
||||||
> CUDA out of memory
|
> CUDA out of memory
|
||||||
|
|||||||
@@ -11,13 +11,14 @@ import bitsandbytes as bnb
|
|||||||
import torch
|
import torch
|
||||||
import transformers
|
import transformers
|
||||||
from optimum.bettertransformer import BetterTransformer
|
from optimum.bettertransformer import BetterTransformer
|
||||||
from transformers import PreTrainedModel # noqa: F401
|
from transformers import ( # noqa: F401
|
||||||
from transformers import (
|
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoModelForCausalLM,
|
AutoModelForCausalLM,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
BitsAndBytesConfig,
|
BitsAndBytesConfig,
|
||||||
LlamaConfig,
|
LlamaConfig,
|
||||||
|
PreTrainedModel,
|
||||||
|
PreTrainedTokenizerBase,
|
||||||
)
|
)
|
||||||
|
|
||||||
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN
|
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN
|
||||||
@@ -71,7 +72,7 @@ def load_tokenizer(
|
|||||||
def load_model(
|
def load_model(
|
||||||
base_model, base_model_config, model_type, tokenizer, cfg, adapter="lora"
|
base_model, base_model_config, model_type, tokenizer, cfg, adapter="lora"
|
||||||
):
|
):
|
||||||
# type: (str, str, str, AutoTokenizer, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
|
# type: (str, str, str, PreTrainedTokenizerBase, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
|
||||||
"""
|
"""
|
||||||
Load a model from a base model and a model type.
|
Load a model from a base model and a model type.
|
||||||
"""
|
"""
|
||||||
@@ -284,6 +285,7 @@ def load_model(
|
|||||||
model = AutoModelForCausalLM.from_pretrained(
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
base_model,
|
base_model,
|
||||||
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
||||||
|
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
||||||
torch_dtype=torch_dtype,
|
torch_dtype=torch_dtype,
|
||||||
device_map=cfg.device_map,
|
device_map=cfg.device_map,
|
||||||
trust_remote_code=cfg.trust_remote_code or False,
|
trust_remote_code=cfg.trust_remote_code or False,
|
||||||
|
|||||||
Reference in New Issue
Block a user