fix new dataset prompt tokenizers

2023-05-21 18:57:09 -04:00
parent e0602a9e54
commit 0f74464652
5 changed files with 151 additions and 12 deletions
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -106,7 +106,7 @@ class ConstantLengthDataset(IterableDataset):
                            }
                        else:
                            logging.warning(
-                                "dropping batch due to tensor size mismatch"
+                                f"dropping batch due to tensor size mismatch input_ids: {input_ids.size()}, labels: {labels.size()}, attention_mask: {attention_mask.size()}"
                            )
                    buffer = {"input_ids": [], "attention_mask": [], "labels": []}
                    buffer_len = 0