Add debug option for RL dataset preprocessing (#1404)

* adding debug option for RL dataset preprocessing
* Refine formatting of debugging code in RL dataset preprocessing
* Update __init__.py
* chore: fix lint

---------

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
2024-04-30 21:06:04 +05:30
parent 1aeece6e24
commit cc5d31e0d9
2 changed files with 75 additions and 3 deletions
--- a/src/axolotl/cli/__init__.py
+++ b/src/axolotl/cli/__init__.py
@@ -433,6 +433,23 @@ def load_rl_datasets(
        math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
    )

+    if cli_args.debug or cfg.debug:
+        LOG.info("check_dataset_labels...")
+
+        tokenizer = load_tokenizer(cfg)
+        check_dataset_labels(
+            train_dataset.select(
+                [
+                    random.randrange(0, len(train_dataset) - 1)  # nosec
+                    for _ in range(cli_args.debug_num_examples)
+                ]
+            ),
+            tokenizer,
+            num_examples=cli_args.debug_num_examples,
+            text_only=cli_args.debug_text_only,
+            rl_mode=True,
+        )
+
    return TrainDatasetMeta(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,