diff --git a/src/axolotl/integrations/diff_transformer/patches.py b/src/axolotl/monkeypatch/attention/differential.py similarity index 95% rename from src/axolotl/integrations/diff_transformer/patches.py rename to src/axolotl/monkeypatch/attention/differential.py index 37ad0a981..037a6f0bd 100644 --- a/src/axolotl/integrations/diff_transformer/patches.py +++ b/src/axolotl/monkeypatch/attention/differential.py @@ -3,7 +3,7 @@ from transformers import PreTrainedModel from transformers.models.llama.modeling_llama import LLAMA_ATTENTION_CLASSES -from .multihead_diffattn import ( +from axolotl.integrations.diff_transformer.multihead_diffattn import ( LlamaDifferentialAttention, LlamaDifferentialSdpaAttention, ) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 3b0dcbc2b..e98e9f31b 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -445,7 +445,7 @@ class ModelLoader: patch_mistral_cross_entropy() if self.cfg.diff_attention: - from axolotl.integrations.diff_transformer.patches import ( + from axolotl.monkeypatch.attention.differential import ( patch_llama_attention_classes, )