better handling of empty input ids when tokenizing (#395)

* better handling of empty input ids when tokenizing

* Add warning if tokenizer resulted in empty result

* fix len comparison for linter
This commit is contained in:
Wing Lian
2023-08-15 01:09:59 -04:00
committed by GitHub
parent 2e22404d2d
commit 85cf4f8e2c

View File

@@ -74,8 +74,11 @@ class PromptTokenizingStrategy(abc.ABC):
 padding=False,
 return_tensors=None,
 )
+if len(result["input_ids"]) == 0:
+LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
 if (
-result["input_ids"][-1] != self.tokenizer.eos_token_id
+len(result["input_ids"]) > 0
+and result["input_ids"][-1] != self.tokenizer.eos_token_id
 and len(result["input_ids"]) < self.sequence_len
 and add_eos_token
 ):