add new sharegpt, refactor prompt so it can be customized later, add exception if no data is processed

This commit is contained in:
Wing Lian
2023-06-11 18:46:26 -04:00
parent f31a338cbb
commit aac4b7691e
6 changed files with 90 additions and 17 deletions

View File

@@ -33,12 +33,16 @@ class TokenizedPromptDataset(IterableDataset):
def __iter__(self):
    """Yield the tokenized form of each usable example in ``self.dataset``.

    Every example is passed to ``self.prompt_tokenizer.tokenize_prompt``;
    examples for which that call raises ``InvalidDataException`` are
    skipped rather than aborting the iteration.

    Raises:
        RuntimeError: if the dataset was exhausted without a single
            example being successfully tokenized and consumed.
    """
    produced = 0
    for record in self.dataset:
        try:
            # The yield is kept inside the try on purpose so the set of
            # exceptions handled here is exactly the same as before.
            yield self.prompt_tokenizer.tokenize_prompt(record)
        except InvalidDataException:
            # Invalid records are dropped deliberately; move on.
            continue
        else:
            # Only reached once the consumer has resumed the generator.
            produced += 1
    if not produced:
        # An empty result almost certainly means a misconfigured dataset
        # or prompt format, so fail loudly instead of training on nothing.
        raise RuntimeError("Expected at least one datapoint in dataset.")
# TODO this isn't the best since it can't interleave datasets