fix(dataset): normalize tokenizer config and change hash from tokenizer class to tokenizer path (#1298)

* fix(dataset): normalize tokenizer config and change hash from tokenizer class to tokenizer path

* fix: normalize config
This commit is contained in:
NanoCode012
2024-03-25 15:34:54 +09:00
committed by GitHub
parent 324d59ea0d
commit ff939d8a64
4 changed files with 13 additions and 4 deletions

View File

@@ -134,9 +134,8 @@ def load_tokenizer(cfg):
if cfg.tokenizer_type:
tokenizer_cls = getattr(transformers, cfg.tokenizer_type)
-tokenizer_config = cfg.tokenizer_config or cfg.base_model_config or cfg.base_model
tokenizer = tokenizer_cls.from_pretrained(
-tokenizer_config,
+cfg.tokenizer_config,
trust_remote_code=cfg.trust_remote_code or False,
use_fast=use_fast,
**tokenizer_kwargs,