From 85cf4f8e2cc22776c7ba4ab70f77ca08760cf705 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 15 Aug 2023 01:09:59 -0400 Subject: [PATCH] better handling of empty input ids when tokenizing (#395) * better handling of empty input ids when tokenizing * Add warning if tokenizer resulted in empty result * fix len comparison for linter --- src/axolotl/prompt_tokenizers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py index e223b6d76..7612f4967 100644 --- a/src/axolotl/prompt_tokenizers.py +++ b/src/axolotl/prompt_tokenizers.py @@ -74,8 +74,11 @@ class PromptTokenizingStrategy(abc.ABC): padding=False, return_tensors=None, ) + if len(result["input_ids"]) == 0: + LOG.warning("Tokenizer result is empty. You may want to audit your dataset") if ( - result["input_ids"][-1] != self.tokenizer.eos_token_id + len(result["input_ids"]) > 0 + and result["input_ids"][-1] != self.tokenizer.eos_token_id and len(result["input_ids"]) < self.sequence_len and add_eos_token ):