better handling of empty input ids when tokenizing (#395)
* better handling of empty input ids when tokenizing
* Add warning if tokenizer resulted in empty result
* fix len comparison for linter
This commit is contained in:
@@ -74,8 +74,11 @@ class PromptTokenizingStrategy(abc.ABC):
|
||||
padding=False,
|
||||
return_tensors=None,
|
||||
)
|
||||
if len(result["input_ids"]) == 0:
|
||||
LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
|
||||
if (
|
||||
result["input_ids"][-1] != self.tokenizer.eos_token_id
|
||||
len(result["input_ids"]) > 0
|
||||
and result["input_ids"][-1] != self.tokenizer.eos_token_id
|
||||
and len(result["input_ids"]) < self.sequence_len
|
||||
and add_eos_token
|
||||
):
|
||||
|
||||
Reference in New Issue
Block a user