Debug tokenization output: add options to output text only (no tokens) and/or to specify the number of samples to show (#511)

This commit is contained in:
Tom Jobbins
2023-08-31 22:26:52 +01:00
committed by GitHub
parent 396a7a74fc
commit 48434bec54
3 changed files with 16 additions and 7 deletions

View File

@@ -246,9 +246,14 @@ def load_datasets(
 LOG.info("check_dataset_labels...")
 check_dataset_labels(
 train_dataset.select(
-[random.randrange(0, len(train_dataset) - 1) for _ in range(5)] # nosec
+[
+random.randrange(0, len(train_dataset) - 1) # nosec
+for _ in range(cli_args.debug_num_examples)
+]
 ),
 tokenizer,
+num_examples=cli_args.debug_num_examples,
+text_only=cli_args.debug_text_only,
 )
 return TrainDatasetMeta(