Process reward models (#2241)

* adding model_cfg to set num_labels * using a num_labels field instead * linting * WIP stepwise prompt tokenizer * this should work? * trainer working? * pushing to runpod * fixing saving * updating conf * updating config, adding docs * adding stepwise supervision docpage * updating tests * adding test for dataset * fixing tests * linting * addressing some comments * adding additional cfg fields support * updating tests, fixing cfg * fixing tests * updating loss * Update test_process_reward_model_smollm2.py * updating loss values and seed * dumb pre-commit
2025-01-29 05:08:33 +00:00
parent c071a530f7
commit 54dd7abfc1
17 changed files with 542 additions and 25 deletions
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -8,6 +8,8 @@ from typing import List, Tuple, Union
 from datasets import (
    Dataset,
    DatasetDict,
+    Sequence,
+    Value,
    concatenate_datasets,
    load_dataset,
    load_from_disk,
@@ -467,6 +469,17 @@ def get_dataset_wrapper(
            dataset,
            **ds_kwargs,
        )
+    elif config_dataset.type.startswith("stepwise_supervised"):
+        dataset_prompter = UnsupportedPrompter()
+        ds_strategy = load(config_dataset.type, tokenizer, cfg, config_dataset)
+        # we need to explicitly cast boolean labels to int
+        # for compatibility with how trl's PRMTrainer works
+        dataset = dataset.cast_column("labels", Sequence(Value("int64")))
+        dataset_wrapper = TokenizedPromptDataset(
+            ds_strategy,
+            dataset,
+            **ds_kwargs,
+        )
    elif ds_strategy := load(
        config_dataset.type, tokenizer, cfg, config_dataset, processor=processor
    ):