fix: move warning after we've set any overrides (#3589) [skip ci]
This commit is contained in:
@@ -221,14 +221,6 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
|
|||||||
if getattr(tokenizer, attr_name) is None:
|
if getattr(tokenizer, attr_name) is None:
|
||||||
setattr(tokenizer, attr_name, "<|endoftext|>")
|
setattr(tokenizer, attr_name, "<|endoftext|>")
|
||||||
|
|
||||||
# Generic fallback: if tokenizer still has no pad_token, use eos_token
|
|
||||||
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
|
|
||||||
tokenizer.pad_token = tokenizer.eos_token
|
|
||||||
LOG.warning(
|
|
||||||
"Tokenizer does not have a pad_token, falling back to eos_token: %s",
|
|
||||||
tokenizer.eos_token,
|
|
||||||
)
|
|
||||||
|
|
||||||
additional_special_tokens = None
|
additional_special_tokens = None
|
||||||
if cfg.special_tokens:
|
if cfg.special_tokens:
|
||||||
special_tokens = cfg.special_tokens.to_dict()
|
special_tokens = cfg.special_tokens.to_dict()
|
||||||
@@ -303,6 +295,14 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
|
|||||||
{"additional_special_tokens": additional_special_tokens}
|
{"additional_special_tokens": additional_special_tokens}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Generic fallback: if tokenizer still has no pad_token, use eos_token
|
||||||
|
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
|
||||||
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
|
LOG.warning(
|
||||||
|
"Tokenizer does not have a pad_token, falling back to eos_token: %s",
|
||||||
|
tokenizer.eos_token,
|
||||||
|
)
|
||||||
|
|
||||||
if is_main_process():
|
if is_main_process():
|
||||||
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
|
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
|
||||||
LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
|
LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
|
||||||
|
|||||||
Reference in New Issue
Block a user