fix new dataset prompt tokenizers

This commit is contained in:
Wing Lian
2023-05-21 18:57:09 -04:00
parent e0602a9e54
commit 0f74464652
5 changed files with 151 additions and 12 deletions

View File

@@ -106,7 +106,7 @@ class ConstantLengthDataset(IterableDataset):
}
else:
logging.warning(
-"dropping batch due to tensor size mismatch"
+f"dropping batch due to tensor size mismatch input_ids: {input_ids.size()}, labels: {labels.size()}, attention_mask: {attention_mask.size()}"
)
buffer = {"input_ids": [], "attention_mask": [], "labels": []}
buffer_len = 0