Improve handling of empty text in the tokenization step (#502)

This commit is contained in:
Wing Lian
2023-09-19 08:09:56 -04:00
committed by GitHub
parent 62a774140b
commit 1eebbd09c3

View File

@@ -6,7 +6,7 @@ import functools
import logging import logging
from typing import Dict, List, Tuple, Union from typing import Dict, List, Tuple, Union
from transformers import PreTrainedTokenizer from transformers import BatchEncoding, PreTrainedTokenizer
from axolotl.prompters import IGNORE_TOKEN_ID from axolotl.prompters import IGNORE_TOKEN_ID
@@ -66,14 +66,21 @@ class PromptTokenizingStrategy(abc.ABC):
pass pass
return False return False
def _tokenize(self, prompt: str, add_eos_token=True, strip_bos_token=False): def _tokenize(
result = self.tokenizer( self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
prompt, ) -> BatchEncoding:
truncation=True, result: BatchEncoding
max_length=self.sequence_len, if not prompt.strip():
padding=False, LOG.warning("Empty text requested for tokenization.")
return_tensors=None, result = BatchEncoding(data={"input_ids": [], "attention_mask": []})
) else:
result = self.tokenizer(
prompt,
truncation=True,
max_length=self.sequence_len,
padding=False,
return_tensors=None,
)
if len(result["input_ids"]) == 0: if len(result["input_ids"]) == 0:
LOG.warning("Tokenizer result is empty. You may want to audit your dataset") LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
if ( if (