diff --git a/src/axolotl/integrations/differential_transformer/differential_attention.py b/src/axolotl/integrations/differential_transformer/differential_attention.py
index 2046f08bc..1543981ea 100644
--- a/src/axolotl/integrations/differential_transformer/differential_attention.py
+++ b/src/axolotl/integrations/differential_transformer/differential_attention.py
@@ -262,6 +262,7 @@ class LlamaDifferentialSdpaAttention(LlamaDifferentialAttention):
         dtype: Data type for the layer parameters
     """
 
+    # pylint: disable=duplicate-code
     def forward(
         self,
         hidden_states: torch.Tensor,  # [bsz, seq_len, hidden_size]
@@ -284,8 +285,8 @@ class LlamaDifferentialSdpaAttention(LlamaDifferentialAttention):
                 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
             )
             return super().forward(
-                hidden_states=hidden_states,  # pylint: disable=duplicate-code
-                attention_mask=attention_mask,  # pylint: disable=duplicate-code
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
                 position_ids=position_ids,
                 past_key_value=past_key_value,
                 output_attentions=output_attentions,
@@ -422,6 +423,7 @@ class LlamaDifferentialFlashAttention2(LlamaDifferentialAttention):
         dtype: Data type for the layer parameters
     """
 
+    # pylint: disable=duplicate-code
     def forward(
         self,
         hidden_states: torch.Tensor,  # [bsz, seq_len, hidden_size]