better handling of empty input ids when tokenizing (#395)
* better handling of empty input ids when tokenizing
* Add warning if tokenizer resulted in empty result
* fix len comparison for linter
This commit is contained in:
@@ -74,8 +74,11 @@ class PromptTokenizingStrategy(abc.ABC):
|
|||||||
padding=False,
|
padding=False,
|
||||||
return_tensors=None,
|
return_tensors=None,
|
||||||
)
|
)
|
||||||
|
if len(result["input_ids"]) == 0:
|
||||||
|
LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
|
||||||
if (
|
if (
|
||||||
result["input_ids"][-1] != self.tokenizer.eos_token_id
|
len(result["input_ids"]) > 0
|
||||||
|
and result["input_ids"][-1] != self.tokenizer.eos_token_id
|
||||||
and len(result["input_ids"]) < self.sequence_len
|
and len(result["input_ids"]) < self.sequence_len
|
||||||
and add_eos_token
|
and add_eos_token
|
||||||
):
|
):
|
||||||
|
|||||||
Reference in New Issue
Block a user