Process reward models (#2241)

* adding model_cfg to set num_labels

* using a num_labels field instead

* linting

* WIP stepwise prompt tokenizer

* this should work?

* trainer working?

* pushing to runpod

* fixing saving

* updating conf

* updating config, adding docs

* adding stepwise supervision docpage

* updating tests

* adding test for dataset

* fixing tests

* linting

* addressing some comments

* adding additional cfg fields support

* updating tests, fixing cfg

* fixing tests

* updating loss

* Update test_process_reward_model_smollm2.py

* updating loss values and seed

* dumb pre-commit
Author: salman
Date: 2025-01-29 05:08:33 +00:00
Committed by: GitHub
Parent: c071a530f7
Commit: 54dd7abfc1
17 changed files with 542 additions and 25 deletions

View File

@@ -44,6 +44,8 @@ from trl import (
     KTOTrainer,
     ORPOConfig,
     ORPOTrainer,
+    PRMConfig,
+    PRMTrainer,
     RewardConfig,
     RewardTrainer,
 )
@@ -342,6 +344,13 @@ class AxolotlRewardConfig(AxolotlTrainingMixins, RewardConfig):
     """


+@dataclass
+class AxolotlPRMConfig(AxolotlTrainingMixins, PRMConfig):
+    """
+    Config for PRM (process reward model) training
+    """
+
+
 class SchedulerMixin(Trainer):
     """
     Mixin class for scheduler setup in CausalTrainer.
@@ -1244,6 +1253,14 @@ class AxolotlRewardTrainer(SchedulerMixin, RewardTrainer):
     tag_names = ["axolotl", "reward"]


+class AxolotlPRMTrainer(SchedulerMixin, PRMTrainer):
+    """
+    Extend the base trl.PRMTrainer for axolotl helpers
+    """
+
+    tag_names = ["axolotl", "prm"]
+
+
 class TrainerBuilderBase(abc.ABC):
     """
     Base class for trainer builder
@@ -1377,7 +1394,8 @@ class TrainerBuilderBase(abc.ABC):
 class HFCausalTrainerBuilder(TrainerBuilderBase):
     """
-    Build the HuggingFace training args/trainer for Causal models
+    Build the HuggingFace training args/trainer for causal models
+    and reward modelling using TRL.
     """

     def get_callbacks(self):
@@ -1452,6 +1470,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             return AxolotlMambaTrainer
         if self.cfg.reward_model:
             return AxolotlRewardTrainer
+        if self.cfg.process_reward_model:
+            return AxolotlPRMTrainer
         return AxolotlTrainer

     def build(self, total_num_steps):
@@ -1842,11 +1862,13 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                 "accelerator_config"
             ] = self.cfg.accelerator_config

-        training_args_cls = (
-            AxolotlTrainingArguments
-            if not self.cfg.reward_model
-            else AxolotlRewardConfig
-        )
+        if self.cfg.reward_model:
+            training_args_cls = AxolotlRewardConfig
+        elif self.cfg.process_reward_model:
+            training_args_cls = AxolotlPRMConfig
+        else:
+            training_args_cls = AxolotlTrainingArguments
+
         training_args = training_args_cls(  # pylint: disable=unexpected-keyword-arg
             **training_arguments_kwargs,
         )
@@ -1880,9 +1902,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
         if eval_data_collator := self.build_collator(
             training_args, is_eval=True, **data_collator_kwargs
         ):
-            if not self.cfg.reward_model:
+            if not (self.cfg.reward_model or self.cfg.process_reward_model):
                 trainer_kwargs["eval_data_collator"] = eval_data_collator

-        if not self.cfg.reward_model:
+        if not (self.cfg.reward_model or self.cfg.process_reward_model):
             trainer_kwargs["bench_data_collator"] = transformers.DataCollatorForSeq2Seq(
                 self.tokenizer,
                 return_tensors="pt",
@@ -1893,8 +1915,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             trainer_kwargs["processing_class"] = self.tokenizer
         else:
             trainer_kwargs["tokenizer"] = self.tokenizer

-        if (trainer_cls is not AxolotlRewardTrainer) and self.cfg.datasets is not None:
+        if (
+            not (trainer_cls in [AxolotlRewardTrainer, AxolotlPRMTrainer])
+            and self.cfg.datasets is not None
+        ):
             trainer_kwargs["dataset_tags"] = [
                 d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir()
             ]
@@ -1984,7 +2008,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
 class HFRLTrainerBuilder(TrainerBuilderBase):
     """
-    Trainer factory class for DPO Trainer
+    Trainer factory class for TRL-based RLHF trainers (e.g. DPO)
     """

     def get_callbacks(self):

View File

@@ -52,6 +52,7 @@ class TokenizedPromptDataset(Dataset):
         if self.prompt_tokenizer.supports_batched:
             map_kwargs["batched"] = True
+            map_kwargs["batch_size"] = 100

         return dataset.map(
             self.prompt_tokenizer.tokenize_prompt,
             num_proc=num_proc,
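
For context on the two kwargs above: `datasets.map` switches calling conventions when `batched=True`, handing the function columns-of-lists in chunks of `batch_size`. A small self-contained sketch with toy data (not part of the commit):

from datasets import Dataset

ds = Dataset.from_dict({"n": [0, 1, 2, 3, 4]})

# batched=False (the default): the function sees one example dict at a time.
ds_single = ds.map(lambda ex: {"n2": ex["n"] * 2})

# batched=True with batch_size: the function sees columns as lists,
# which is why only strategies with supports_batched=True opt in.
ds_batched = ds.map(
    lambda batch: {"n2": [n * 2 for n in batch["n"]]},
    batched=True,
    batch_size=100,
)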

View File

@@ -0,0 +1,116 @@
+"""
+Module for stepwise datasets, which typically include a prompt, reasoning traces,
+and (optionally) per-step or per-trace labels for reward modelling.
+"""
+from itertools import chain
+from typing import Dict, List, Optional, Union
+
+from transformers import BatchEncoding, PreTrainedTokenizer
+
+from axolotl.prompt_tokenizers import IGNORE_INDEX
+from axolotl.utils.dict import DictDefault
+
+
+class StepwiseSupervisedPromptTokenizingStrategy:
+    """
+    Tokenizing strategy for supervised stepwise datasets, typically used for
+    chain-of-thought (CoT) reasoning. These datasets should include the following columns:
+    - prompt: the prompt text
+    - completions: a list of `n` completion steps
+    - labels: a list of `n` labels indicating the "correctness" of each step
+    """
+
+    def __init__(
+        self,
+        tokenizer,
+        sequence_len: int = 2048,
+        step_separator: str = "\n",
+        max_completion_length: Optional[int] = None,
+        train_on_last_step_only: bool = False,
+    ):
+        self.tokenizer = tokenizer
+        self.sequence_len = sequence_len
+        self.step_separator = step_separator
+        self.max_completion_length = max_completion_length
+        self.train_on_last_step_only = train_on_last_step_only
+
+    def tokenize_prompt(
+        self, prompt: Dict[str, Union[str, List[str]]]
+    ) -> BatchEncoding:
+        # Inspired by TRL's PRMTrainer
+        # https://github.com/huggingface/trl/blob/ed7de87dc766478c024b68f12530d1b0e7c3ff23/trl/trainer/prm_trainer.py#L206
+        prompt_ids = self.tokenizer(prompt["prompt"], add_special_tokens=False)[
+            "input_ids"
+        ]
+        completions_ids = [
+            self.tokenizer(completion, add_special_tokens=False)["input_ids"]
+            for completion in prompt["completions"]
+        ]
+
+        # Handle labels
+        if self.train_on_last_step_only:
+            labels = [IGNORE_INDEX] * (len(prompt["labels"]) - 1) + [
+                int(prompt["labels"][-1])
+            ]
+        else:
+            labels = [int(label) for label in prompt["labels"]]
+
+        # Add step separators
+        separator_ids = self.tokenizer.encode(
+            self.step_separator, add_special_tokens=False
+        )
+        completions_ids = [completion + separator_ids for completion in completions_ids]
+
+        # Create step-wise labels
+        labels = [
+            [IGNORE_INDEX] * (len(completion) - 1) + [label]  # type: ignore
+            for completion, label in zip(completions_ids, labels)
+        ]
+
+        # Join all steps
+        completion_ids = list(chain(*completions_ids))
+        labels = list(chain(*labels))  # type: ignore
+
+        # Handle max lengths
+        if self.max_completion_length:
+            completion_ids = completion_ids[: self.max_completion_length]
+            labels = labels[: self.max_completion_length]
+
+        # Add BOS token if model has one
+        if self.tokenizer.bos_token_id is not None:
+            prompt_ids = [self.tokenizer.bos_token_id] + prompt_ids
+
+        # Combine prompt and completion
+        input_ids = prompt_ids + completion_ids
+        full_labels = [IGNORE_INDEX] * len(prompt_ids) + labels
+
+        # Apply max sequence length
+        if self.sequence_len:
+            input_ids = input_ids[: self.sequence_len]
+            full_labels = full_labels[: self.sequence_len]
+
+        return {
+            "input_ids": input_ids,
+            "labels": full_labels,
+            "attention_mask": [1] * len(input_ids),
+        }
+
+    @property
+    def supports_batched(self):
+        return False
+
+
+def load(
+    tokenizer: PreTrainedTokenizer,
+    cfg: DictDefault,
+    ds_cfg: DictDefault,
+) -> StepwiseSupervisedPromptTokenizingStrategy:
+    return StepwiseSupervisedPromptTokenizingStrategy(
+        tokenizer,
+        cfg.sequence_len,
+        step_separator=ds_cfg.get("step_separator", "\n"),
+        max_completion_length=ds_cfg.max_completion_length,
+        train_on_last_step_only=ds_cfg.get("train_on_last_step_only", False),
+    )
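
As a usage sketch of the strategy above: every token is masked with IGNORE_INDEX (-100) except the final token of each completion step (the appended separator), which carries that step's 0/1 label. The SmolLM2 checkpoint below is only illustrative:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")  # illustrative
strategy = StepwiseSupervisedPromptTokenizingStrategy(tokenizer, sequence_len=2048)

sample = {
    "prompt": "What is 2 + 2?",
    "completions": ["2 + 2 = 4.", "So the answer is 4."],
    "labels": [True, True],
}
out = strategy.tokenize_prompt(sample)

# Prompt tokens and non-final step tokens are -100; the last token of each
# step holds int(label), i.e. 0 or 1.
assert all(label in (-100, 0, 1) for label in out["labels"])
assert len(out["input_ids"]) == len(out["labels"])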

View File

@@ -259,7 +259,7 @@ def train(
             .decode("utf-8")
         }
         if cfg.datasets is not None:
-            if cfg.rl is not None or cfg.reward_model:
+            if cfg.rl is not None or cfg.reward_model or cfg.process_reward_model:
                 dataset_tags = [
                     d["path"] for d in cfg.datasets if not Path(d["path"]).is_dir()
                 ]

View File

@@ -236,6 +236,18 @@ class DPODataset(BaseModel):
     revision: Optional[str] = None


+class StepwiseSupervisedDataset(BaseModel):
+    """Stepwise supervised dataset configuration subset"""
+
+    path: Optional[str] = None
+    split: Optional[str] = None
+    data_files: Optional[List[str]] = None
+    revision: Optional[str] = None
+    step_separator: Optional[str] = None
+    max_completion_length: Optional[int] = None
+    train_on_last_step_only: Optional[bool] = None
+
+
 class UserDefinedKTOType(BaseModel):
     """User defined typing for KTO"""
@@ -626,12 +638,14 @@ class AxolotlInputConfig(
     rl: Optional[RLType] = None
     reward_model: Optional[bool] = None
+    process_reward_model: Optional[bool] = None
+    num_labels: Optional[int] = None
     dpo_use_weighting: Optional[
         bool
     ] = None  # whether to use weighting in DPO trainer. If none, default is false in the trainer.
-    datasets: Optional[conlist(Union[SFTDataset, DPODataset, KTODataset], min_length=1)] = None  # type: ignore
-    test_datasets: Optional[conlist(Union[SFTDataset, DPODataset, KTODataset], min_length=1)] = None  # type: ignore
+    datasets: Optional[conlist(Union[SFTDataset, DPODataset, KTODataset, StepwiseSupervisedDataset], min_length=1)] = None  # type: ignore
+    test_datasets: Optional[conlist(Union[SFTDataset, DPODataset, KTODataset, StepwiseSupervisedDataset], min_length=1)] = None  # type: ignore
     shuffle_merged_datasets: Optional[bool] = True
     dataset_prepared_path: Optional[str] = None
     dataset_shard_num: Optional[int] = None
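
For illustration, the new schema fields could be exercised with a config along these lines (a sketch only; the dataset path and label count are placeholders, and the same keys map 1:1 onto a YAML config):

from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "process_reward_model": True,  # routes to AxolotlPRMTrainer / AxolotlPRMConfig
        "num_labels": 2,  # binary per-step correctness labels
        "datasets": [
            {
                "path": "trl-lib/math_shepherd",  # placeholder dataset path
                "type": "stepwise_supervised",
                "step_separator": "\n",
                "train_on_last_step_only": False,
            }
        ],
    }
)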

View File

@@ -8,6 +8,8 @@ from typing import List, Tuple, Union
 from datasets import (
     Dataset,
     DatasetDict,
+    Sequence,
+    Value,
     concatenate_datasets,
     load_dataset,
     load_from_disk,
@@ -467,6 +469,17 @@ def get_dataset_wrapper(
             dataset,
             **ds_kwargs,
         )
+    elif config_dataset.type.startswith("stepwise_supervised"):
+        dataset_prompter = UnsupportedPrompter()
+        ds_strategy = load(config_dataset.type, tokenizer, cfg, config_dataset)
+        # we need to explicitly cast boolean labels to int
+        # for compatibility with how trl's PRMTrainer works
+        dataset = dataset.cast_column("labels", Sequence(Value("int64")))
+        dataset_wrapper = TokenizedPromptDataset(
+            ds_strategy,
+            dataset,
+            **ds_kwargs,
+        )
     elif ds_strategy := load(
         config_dataset.type, tokenizer, cfg, config_dataset, processor=processor
     ):
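
The cast itself is a plain `datasets` column cast; a self-contained sketch of its effect:

from datasets import Dataset, Sequence, Value

ds = Dataset.from_dict({"labels": [[True, False], [True]]})
ds = ds.cast_column("labels", Sequence(Value("int64")))
print(ds["labels"])  # [[1, 0], [1]] -- booleans become 0/1 ints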

View File

@@ -138,7 +138,9 @@ def load_model_config(cfg):
     config_kwargs = {}
     if cfg.revision_of_model:
         config_kwargs["revision"] = cfg.revision_of_model
+    if cfg.num_labels:
+        # num_labels is used to initialize classifier models
+        config_kwargs["num_labels"] = cfg.num_labels
     try:
         model_config = AutoConfig.from_pretrained(
             model_config_name,
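
Downstream, that `num_labels` lands on the `AutoConfig`, sizing the classification head of the token-classification model that TRL's PRMTrainer trains. A hedged sketch (checkpoint name illustrative):

from transformers import AutoConfig, AutoModelForTokenClassification

config = AutoConfig.from_pretrained("HuggingFaceTB/SmolLM2-135M", num_labels=2)
model = AutoModelForTokenClassification.from_pretrained(
    "HuggingFaceTB/SmolLM2-135M", config=config
)
print(model.num_labels)  # 2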