diff --git a/src/axolotl/integrations/diff_transformer/README.md b/src/axolotl/integrations/diff_transformer/README.md index a3b39bee6..ea27e0291 100644 --- a/src/axolotl/integrations/diff_transformer/README.md +++ b/src/axolotl/integrations/diff_transformer/README.md @@ -24,3 +24,17 @@ plugins: diff_attention: true ``` + +Additional, optional arguments include: + +```yaml +# How often to log differential attention-related metrics to wandb +diff_attn_log_every: 100 + +# How many differential attention layers to monitor (strided from 0..k..num_layers) +diff_attn_num_monitor_layers: 3 + +# How many steps to "warm up" the mixing parameter for the negative component of differential attention +# Follows a linear warmup schedule from 0 to 1; if not specified, the mixing parameter is set to 1 +diff_attn_warmup_steps: 1000 +```