diff --git a/.runpod/src/config/config.yaml b/.runpod/src/config/config.yaml
index 4dff37cae..42c5978d5 100644
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -242,16 +242,12 @@
 # early_stopping_patience: 3
 
 # # Specify a scheduler and kwargs to use with the optimizer
-# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
+# lr_scheduler: # 'one_cycle' | empty for cosine
 # lr_scheduler_kwargs:
 # # For one_cycle optim
 # lr_div_factor: # Learning rate div factor
-# # For log_sweep optim
-# log_sweep_min_lr:
-# log_sweep_max_lr:
-
 # # Specify optimizer
 # # Valid values are driven by the Transformers OptimizerNames class, see:
 # # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
diff --git a/docs/config.qmd b/docs/config.qmd
index eab8d28ca..369d3db43 100644
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -272,10 +272,25 @@ trl:
   num_generations: # Optional[int]. Number of generations to sample.
   log_completions: # Optional[bool]. Whether to log completions.
+  num_completions_to_print: # Optional[int]. Number of completions to print when log_completions is True.
   sync_ref_model: # Optional[bool]. Whether to sync the reference model.
   ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
   ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
+  scale_rewards: # Optional[bool]. Whether to scale rewards by their standard deviation.
+
+  temperature: # Optional[float]. Sampling temperature for the GRPO policy.
+  top_p: # Optional[float]. Top-p sampling probability for the generation policy.
+  top_k: # Optional[int]. Top-k sampling for the generation policy.
+  min_p: # Optional[float]. Minimum probability for the generation policy.
+  repetition_penalty: # Optional[float]. Penalty for tokens that appear in the prompt and generated text.
+
+  num_iterations: # Optional[int]. Number of iterations per batch (μ) for GRPO.
+  epsilon: # Optional[float]. Epsilon value for clipping in the GRPO algorithm.
+  epsilon_high: # Optional[float]. Upper-bound epsilon value for clipping in the GRPO algorithm.
+  use_liger_loss: # Optional[bool]. Whether to use Liger loss for GRPO.
+  loss_type: # Optional[str]. Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.
+  mask_truncated_completions: # Optional[bool]. Whether to exclude truncated completions from loss calculation.
 
 # reward modelling: `True` or `False`
@@ -553,7 +568,24 @@ gradient_checkpointing: false
 early_stopping_patience: 3
 
 # Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine
+# Valid values are driven by the Transformers SchedulerType class, see:
+# https://github.com/huggingface/transformers/blob/5f4ecf2d9f867a1255131d2461d75793c0cf1db2/src/transformers/trainer_utils.py#L420
+# Valid values include:
+# - 'linear'
+# - 'cosine' (default)
+# - 'cosine_with_restarts'
+# - 'polynomial'
+# - 'constant'
+# - 'constant_with_warmup'
+# - 'inverse_sqrt'
+# - 'reduce_lr_on_plateau'
+# - 'cosine_with_min_lr'
+# - 'warmup_stable_decay'
+
+# Additional schedulers include:
+# - 'one_cycle'
+# - 'rex'
+lr_scheduler:
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
 cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
@@ -571,7 +603,7 @@ lr_div_factor: # Learning rate div factor
 #
 # Valid values for 'optimizer' include:
 # - adamw_torch
-# - adamw_torch_fused
+# - adamw_torch_fused (default)
 # - adamw_torch_xla
 # - adamw_torch_npu_fused
 # - adamw_apex_fused
diff --git a/docs/rlhf.qmd b/docs/rlhf.qmd
index 3a8f87d71..e0d3b55e4 100644
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -16,7 +16,7 @@ feedback. Various methods include, but not limited to:
 - [Identity Preference Optimization (IPO)](#ipo)
 - [Kahneman-Tversky Optimization (KTO)](#kto)
 - [Odds Ratio Preference Optimization (ORPO)](#orpo)
-- Proximal Policy Optimization (PPO) (not yet supported in axolotl)
+- Proximal Policy Optimization (PPO) (not yet supported in axolotl; if you're interested in contributing, please reach out!)
 
 ## RLHF using Axolotl
 
@@ -582,7 +582,20 @@ datasets:
 
 To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function).
 
-To see description of the configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/models/input/v0_4_1/trl.py).
+For all available configs, see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
+
+#### GRPO with DAPO/Dr. GRPO loss
+
+The DAPO paper, and subsequently the Dr. GRPO paper, proposed alternative loss formulations for GRPO to remedy the penalty on longer responses.
+
+```yaml
+trl:
+  loss_type: dr_grpo
+  # Normalizes loss based on max completion length (default: 256)
+  max_completion_length:
+```
+
+For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types).
 
 ### SimPO
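
As a reviewer aid, here is a sketch of how the GRPO options documented in this diff compose in a single `trl:` block. Every key is taken from the docs/config.qmd hunk above; the values are illustrative assumptions only (the `epsilon`/`epsilon_high` pair loosely follows the asymmetric clip-higher setting from the DAPO paper), not tuned recommendations:

```yaml
# Hypothetical GRPO tuning block; keys come from the docs/config.qmd diff
# above, values are illustrative assumptions rather than defaults.
trl:
  # Generation policy sampling
  temperature: 0.7
  top_p: 0.9
  top_k: 50
  repetition_penalty: 1.1

  # GRPO optimization
  num_iterations: 2            # μ: optimization iterations per batch
  epsilon: 0.2                 # lower clipping bound
  epsilon_high: 0.28           # asymmetric upper clipping bound (DAPO-style)
  loss_type: dr_grpo           # grpo | bnpo | dr_grpo
  mask_truncated_completions: true
  scale_rewards: false

  # Logging
  log_completions: true
  num_completions_to_print: 4
```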
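Similarly, a minimal sketch of the scheduler settings touched by this diff, assuming the flush-left `cosine_min_lr_ratio` / `cosine_constant_lr_ratio` keys shown in the docs/config.qmd context; the values mirror the examples given in those comments:

```yaml
# Illustrative values taken from the inline comments in docs/config.qmd.
lr_scheduler: cosine           # default when left empty
learning_rate: 0.0002
cosine_min_lr_ratio: 0.1       # decay to 10% of peak lr
cosine_constant_lr_ratio: 0.8  # hold cosine_min_lr from 80% of training onward
```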