improve handling for empty text on the tokenization step (#502)

2023-09-19 08:09:56 -04:00
parent 62a774140b
commit 1eebbd09c3
1 changed file with 16 additions and 9 deletions
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -6,7 +6,7 @@ import functools
 import logging
 from typing import Dict, List, Tuple, Union
-from transformers import PreTrainedTokenizer
+from transformers import BatchEncoding, PreTrainedTokenizer
 from axolotl.prompters import IGNORE_TOKEN_ID
@@ -66,7 +66,14 @@ class PromptTokenizingStrategy(abc.ABC):
            pass
        return False
-    def _tokenize(self, prompt: str, add_eos_token=True, strip_bos_token=False):
+    def _tokenize(
        self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
    ) -> BatchEncoding:
        result: BatchEncoding
        if not prompt.strip():
            LOG.warning("Empty text requested for tokenization.")
            result = BatchEncoding(data={"input_ids": [], "attention_mask": []})
        else:
            result = self.tokenizer(
                prompt,
                truncation=True,