Update rlhf.md (#1178) [skip ci]
This commit is contained in:
@@ -19,14 +19,14 @@ The various RL training methods are implemented in trl and wrapped via axolotl.
|
|||||||
|
|
||||||
#### DPO
|
#### DPO
|
||||||
```yaml
|
```yaml
|
||||||
rl: true
|
rl: dpo
|
||||||
datasets:
|
datasets:
|
||||||
- path: Intel/orca_dpo_pairs
|
- path: Intel/orca_dpo_pairs
|
||||||
split: train
|
split: train
|
||||||
type: intel_apply_chatml
|
type: chatml.intel
|
||||||
- path: argilla/ultrafeedback-binarized-preferences
|
- path: argilla/ultrafeedback-binarized-preferences
|
||||||
split: train
|
split: train
|
||||||
type: argilla_apply_chatml
|
type: chatml.argilla
|
||||||
```
|
```
|
||||||
|
|
||||||
#### IPO
|
#### IPO
|
||||||
|
|||||||
Reference in New Issue
Block a user