feat(doc): add info on how to use dapo / dr grpo and misc doc fixes (#2673) [skip ci]

* feat(doc): add info on how to use dapo / dr grpo * chore: add missing config to docs * fix: missing comment * fix: add missing scheduler from schema * chore: refactor lr scheduler docs * fix: remove log_sweep
2025-05-28 15:51:04 +07:00
parent add2025253
commit 6b6370f4e3
3 changed files with 50 additions and 9 deletions
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -242,16 +242,12 @@
 # early_stopping_patience: 3

 # # Specify a scheduler and kwargs to use with the optimizer
-# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
+# lr_scheduler: # 'one_cycle' | empty for cosine
 # lr_scheduler_kwargs:

 # # For one_cycle optim
 # lr_div_factor: # Learning rate div factor

-# # For log_sweep optim
-# log_sweep_min_lr:
-# log_sweep_max_lr:
-
 # # Specify optimizer
 # # Valid values are driven by the Transformers OptimizerNames class, see:
 # # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -272,10 +272,25 @@ trl:

  num_generations: # Optional[int]. Number of generations to sample.
  log_completions: # Optional[bool]. Whether to log completions.
+  num_completions_to_print: # Optional[int]. Number of completions to print when log_completions is True.

  sync_ref_model: # Optional[bool]. Whether to sync the reference model.
  ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
  ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
+  scale_rewards: # Optional[bool]. Whether to scale rewards by their standard deviation.
+
+  temperature: # Optional[float]. Sampling temperature for the GRPO policy.
+  top_p: # Optional[float]. Top-p sampling probability for the generation policy.
+  top_k: # Optional[int]. Top-k sampling for the generation policy.
+  min_p: # Optional[float]. Minimum probability for the generation policy.
+  repetition_penalty: # Optional[float]. Penalty for tokens that appear in prompt and generated text.
+
+  num_iterations: # Optional[int]. Number of iterations per batch (μ) for GRPO.
+  epsilon: # Optional[float]. Epsilon value for clipping in the GRPO algorithm.
+  epsilon_high: # Optional[float]. Upper-bound epsilon value for clipping in the GRPO algorithm.
+  use_liger_loss: # Optional[bool]. Whether to use Liger loss for GRPO.
+  loss_type: # Optional[str]. Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.
+  mask_truncated_completions: # Optional[bool]. Whether to exclude truncated completions from loss calculation.


 # reward modelling: `True` or `False`
@@ -553,7 +568,24 @@ gradient_checkpointing: false
 early_stopping_patience: 3

 # Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine
+# Valid values are driven by the Transformers SchedulerType class, see:
+# https://github.com/huggingface/transformers/blob/5f4ecf2d9f867a1255131d2461d75793c0cf1db2/src/transformers/trainer_utils.py#L420
+# Valid values include:
+# - 'linear'
+# - 'cosine' (default)
+# - 'cosine_with_restarts'
+# - 'polynomial'
+# - 'constant'
+# - 'constant_with_warmup'
+# - 'inverse_sqrt'
+# - 'reduce_lr_on_plateau'
+# - 'cosine_with_min_lr'
+# - 'warmup_stable_decay'
+
+# Additional schedulers include:
+# - 'one_cycle'
+# - 'rex'
+lr_scheduler:
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
 cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
@@ -571,7 +603,7 @@ lr_div_factor: # Learning rate div factor
 #
 # Valid values for 'optimizer' include:
 # - adamw_torch
-# - adamw_torch_fused
+# - adamw_torch_fused (default)
 # - adamw_torch_xla
 # - adamw_torch_npu_fused
 # - adamw_apex_fused
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -16,7 +16,7 @@ feedback. Various methods include, but not limited to:
 - [Identity Preference Optimization (IPO)](#ipo)
 - [Kahneman-Tversky Optimization (KTO)](#kto)
 - [Odds Ratio Preference Optimization (ORPO)](#orpo)
- Proximal Policy Optimization (PPO) (not yet supported in axolotl)
+- Proximal Policy Optimization (PPO) (not yet supported in axolotl; if you're interested in contributing, please reach out!)


 ## RLHF using Axolotl
@@ -582,7 +582,20 @@ datasets:

 To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function).

-To see description of the configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/models/input/v0_4_1/trl.py).
+For the full list of configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
+
+#### GRPO with DAPO/Dr. GRPO loss
+
+The DAPO paper, and subsequently the Dr. GRPO paper, proposed an alternative loss function for GRPO to remediate the penalty on longer responses.
+
+```yaml
+trl:
+  loss_type: dr_grpo
+  # Normalizes loss based on max completion length (default: 256)
+  max_completion_length:
+```
+
+For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types).

 ### SimPO