load the tokenizer separately from the model

Wing Lian
2023-05-26 07:29:35 -04:00
parent bbfc333a01
commit 32e6fe9286
2 changed files with 62 additions and 54 deletions


@@ -21,7 +21,7 @@ src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)
 from axolotl.utils.data import load_prepare_datasets
-from axolotl.utils.models import load_model
+from axolotl.utils.models import load_model, load_tokenizer
 from axolotl.utils.trainer import setup_trainer
 from axolotl.utils.wandb import setup_wandb_env_vars
@@ -161,13 +161,30 @@ def train(
     validate_config(cfg)
 
+    # load the tokenizer first
+    logging.info("loading tokenizer...")
+    tokenizer = load_tokenizer(
+        cfg.base_model_config,
+        cfg.tokenizer_type,
+        cfg
+    )
+
+    if "inference" not in kwargs and "shard" not in kwargs:  # don't need to load dataset for these
+        train_dataset, eval_dataset = load_prepare_datasets(
+            tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
+        )
+
+    if prepare_ds_only:
+        logging.info("Finished preparing dataset. Exiting...")
+        return
+
     # Load the model and tokenizer
-    logging.info("loading model, tokenizer, and peft_config...")
-    model, tokenizer, peft_config = load_model(
+    logging.info("loading model and peft_config...")
+    model, peft_config = load_model(
         cfg.base_model,
         cfg.base_model_config,
         cfg.model_type,
-        cfg.tokenizer_type,
+        tokenizer,
         cfg,
         adapter=cfg.adapter,
         inference=("inference" in kwargs),
@@ -192,10 +209,6 @@ def train(
         model.save_pretrained(cfg.output_dir)
         return
 
-    train_dataset, eval_dataset = load_prepare_datasets(
-        tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
-    )
-
     if cfg.debug:
         logging.info("check_dataset_labels...")
         check_dataset_labels(
@@ -205,10 +218,6 @@ def train(
             tokenizer,
         )
 
-    if prepare_ds_only:
-        logging.info("Finished preparing dataset. Exiting...")
-        return
-
     trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer)
 
     model.config.use_cache = False
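
The point of the reordering is visible in the second hunk: dataset preparation only needs the tokenizer, so loading it first lets a prepare_ds_only run tokenize and exit before any model weights are pulled into memory. For context, a minimal sketch of what the new load_tokenizer helper (imported from axolotl.utils.models, the other file changed in this commit) might look like, inferred purely from its call site above; the class lookup and pad-token handling are assumptions, not the actual implementation:

# Hypothetical sketch only -- the real load_tokenizer lives in
# axolotl.utils.models and may handle more cases.
import transformers
from transformers import AutoTokenizer

def load_tokenizer(base_model_config, tokenizer_type, cfg):
    """Load a tokenizer, mirroring the call site:
    load_tokenizer(cfg.base_model_config, cfg.tokenizer_type, cfg)."""
    if tokenizer_type:
        # cfg.tokenizer_type presumably names a concrete tokenizer class
        # such as "LlamaTokenizer" (assumption).
        tokenizer_cls = getattr(transformers, tokenizer_type)
        tokenizer = tokenizer_cls.from_pretrained(base_model_config)
    else:
        tokenizer = AutoTokenizer.from_pretrained(base_model_config)

    # Many causal-LM tokenizers ship without a pad token; reusing EOS is a
    # common default (assumption, not necessarily what axolotl does).
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer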