From 3b477e08a0e67d405daf29acac97ed358183af89 Mon Sep 17 00:00:00 2001
From: NanoCode012 <nano@axolotl.ai>
Date: Mon, 10 Mar 2025 16:25:31 +0700
Subject: [PATCH] feat(doc): add more info on RewardModel datasets (#2391)

* fix: reduce title size

* feat(doc): add rm dataset info

* Update docs/reward_modelling.qmd following suggestion

Co-authored-by: salman <salman.mohammadi@outlook.com>

---------

Co-authored-by: salman <salman.mohammadi@outlook.com>
---
 docs/reward_modelling.qmd | 13 +++++++++++++
 styles.css                |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/docs/reward_modelling.qmd b/docs/reward_modelling.qmd
index 8baa93424..c9ac5f801 100644
--- a/docs/reward_modelling.qmd
+++ b/docs/reward_modelling.qmd
@@ -28,6 +28,17 @@ val_set_size: 0.1
 eval_steps: 100
 ```
 
+Bradley-Terry chat templates expect single-turn conversations in the following format (the `system` field is optional):
+
+```json
+{
+    "system": "...",
+    "input": "...",
+    "chosen": "...",
+    "rejected": "..."
+}
+```
+
 ### Process Reward Models (PRM)
 
 Process reward models are trained using data which contains preference annotations for each step in a series of interactions. Typically, PRMs are trained to provide reward signals over each step of a reasoning trace and are used for downstream reinforcement learning.
@@ -45,3 +56,5 @@ datasets:
 val_set_size: 0.1
 eval_steps: 100
 ```
+
+Please see the [stepwise supervised dataset format](dataset-formats/stepwise_supervised.qmd) page for more details.
diff --git a/styles.css b/styles.css
index 891349b4b..749ff4366 100644
--- a/styles.css
+++ b/styles.css
@@ -14,7 +14,7 @@
 h1 {
     font-family: var(--font-title);
     font-weight: 400;
-    font-size: 6rem;
+    font-size: 5rem;
     line-height: 1.1;
     letter-spacing: -0.05em;
     font-feature-settings: "ss01" on;