chore: moved feature map into linear attention

2025-02-05 19:40:11 +07:00
parent 0e6efaa10c
commit 578fa764c8
2 changed files with 333 additions and 338 deletions
--- a/src/axolotl/integrations/lolcats/linear_llama/feature_map.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/feature_map.py
@@ -1,336 +0,0 @@
-"""
-Learnable linear attention feature map classes and functions
-"""
-
-from typing import Optional
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-def init_feature_map(name: str, mlp: nn.Module, **kwargs):
-    """
-    Initialize feature map final activation for linear attention
-    """
-    return FeatureMap(activation_name=name, mlp=mlp, **kwargs)
-
-
-def init_feature_map_act(name: str, fullspace: bool = True, **kwargs):
-    """
-    Initialize feature map final activation for linear attention
-    """
-    if name == "softmax_dim" and fullspace:
-        return SoftmaxDim(**kwargs)
-    elif name == "softmax_dim" and not fullspace:
-        return SoftmaxDimHalfspace(**kwargs)
-    elif name == "exp_dim" and fullspace:
-        return Exp(**kwargs)
-    elif name == "exp_dim" and not fullspace:
-        return ExpHalfspace(**kwargs)
-    elif name == "pos_elu":
-        return PosELU(**kwargs)
-    elif name == "relu":
-        return ReLU(**kwargs)
-
-    else:
-        raise NotImplementedError
-
-
-def init_learned_kernel(name: str, **kwargs):
-    """
-    Initialize feature map MLP for linear attention
-    """
-    if name == "untied_head_einsum":
-        return FeatureMapMLP(**kwargs)
-    elif name == "untied_head_adapter":
-        return FeatureMapAdapter(**kwargs)
-    else:
-        raise NotImplementedError
-
-
-class FeatureMap(nn.Module):
-    """
-    Final 'activation' of feature map. Can probably be combined with
-    `FeatureMapMLP` below
-
-    Full feature map is like f(xW + b)
-    -> This is the `f` part
-    """
-
-    def __init__(
-        self,
-        activation_name: str,
-        head_dim_idx: int = -1,
-        eps: float = 1e-12,
-        mlp: Optional[nn.Module] = None,
-        fullspace: bool = True,
-    ):
-        super().__init__()
-        self.head_dim_idx = head_dim_idx
-        self.eps = eps
-        self.mlp = mlp if mlp is not None else nn.Identity()
-        self.activation = init_feature_map_act(activation_name, fullspace, eps=eps)
-
-    def forward(self, x: torch.Tensor, *mlp_args, **mlp_kwargs):
-        """
-        Assume x.shape is (batch_size, n_heads, seq_len, head_dim)
-        """
-        return self.activation(self.mlp(x, *mlp_args, **mlp_kwargs), x)
-
-    def q_map(self, *args, **kwargs):
-        """
-        Use for inference in case q and k feature maps differ
-        """
-        return self.forward(*args, **kwargs)
-
-    def k_map(self, *args, **kwargs):
-        """
-        Use for inference in case q and k feature maps differ
-        """
-        return self.forward(*args, **kwargs)
-
-
-# -----------------------
-# Feature map activations
-# -----------------------
-class FeatureMapAct(nn.Module):
-    """
-    Base class for feature map activations
-    """
-
-    def __init__(self, eps: float = 1e-12):
-        super().__init__()
-        self.eps = eps
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        """
-        x.shape is (batch_size, n_heads, seq_len, head_dim)
-        """
-        return x
-
-
-class PosELU(FeatureMapAct):
-    """
-    1 + ELU activation as in https://arxiv.org/abs/2006.16236
-    """
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        return (1 + F.elu(x)).clamp(min=self.eps)
-
-
-class ReLU(FeatureMapAct):
-    """
-    ReLU activation as in https://arxiv.org/abs/2103.13076
-    """
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        return F.relu(x).clamp(min=self.eps)
-
-
-class SoftmaxDim(FeatureMapAct):
-    """
-    Softmax activation as in https://arxiv.org/abs/2402.04347
-    """
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        return torch.cat(
-            [torch.softmax(x, dim=-1), torch.softmax(-x, dim=-1)], dim=-1
-        ).clamp(min=self.eps)
-
-
-class SoftmaxDimHalfspace(FeatureMapAct):
-    """
-    Softmax activation as in https://arxiv.org/abs/2402.04347
-    """
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        return torch.softmax(x, dim=-1).clamp(min=self.eps)
-
-
-class Exp(FeatureMapAct):
-    """
-    Exp activation as in https://arxiv.org/abs/2402.04347
-    """
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        x_max = torch.amax(x, dim=-1, keepdim=True)
-        x_min = torch.amin(x, dim=-1, keepdim=True)
-        return torch.cat([torch.exp(x - x_max), torch.exp(-x + x_min)], dim=-1).clamp(
-            min=self.eps
-        )
-
-
-class ExpHalfspace(FeatureMapAct):
-    """
-    Exp activation as in https://arxiv.org/abs/2402.04347
-    """
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        x_max = torch.amax(x, dim=-1, keepdim=True)
-        return torch.exp(x - x_max).clamp(min=self.eps)
-
-
-# ----------------
-# Feature map MLPs
-# ----------------
-
-
-class FeatureMapMLP(nn.Module):
-    """
-    Learnable MLP in feature map.
-
-    Full feature map is like f(xW + b)
-    -> This is the `W` and (optional) `b` part
-    """
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_dim: int,  # input dim
-        feature_dim: int,  # output dim
-        dtype: torch.dtype,
-        device: torch.device,
-        skip_connection: bool = False,
-        bias: bool = False,
-        zero_init: bool = False,
-        normal_init: bool = False,
-    ):
-        super().__init__()
-        self.num_heads = num_heads
-        self.head_dim = head_dim
-        self.feature_dim = feature_dim
-        self.dtype = dtype
-        self.device = device
-        self.skip_connection = skip_connection
-        self.bias = bias
-        self.zero_init = zero_init
-        self.normal_init = normal_init
-        self.init_weights_()
-
-        if self.zero_init:  # Zero-out weights or set as identity post-initialization
-            self.zero_init_with_skip_() if self.skip_connection else self.zero_init_()
-
-        if self.normal_init:
-            with torch.no_grad():
-                nn.init.normal_(self.layer)
-
-        if self.skip_connection:
-            assertion_fail = f"If self.skip_connection we need self.head_dim == self.feature_dim but self.head_dim is {self.head_dim} != self.feature_dim is {self.feature_dim}"
-            assert self.head_dim == self.feature_dim, assertion_fail
-
-    def init_weights_(self):
-        """
-        Initialize (W)eights and (b)iases
-        """
-        self.layer = nn.Parameter(
-            torch.zeros(
-                (self.num_heads, self.head_dim, self.feature_dim),
-                dtype=self.dtype,
-                device=self.device,
-            )
-        )
-        nn.init.kaiming_uniform_(self.layer)
-
-        if self.bias:
-            self.bias = nn.Parameter(
-                torch.zeros(
-                    (1, self.num_heads, 1, 1),  # self.feature_dim),
-                    dtype=self.dtype,
-                    device=self.device,
-                )
-            )
-            nn.init.kaiming_uniform_(self.bias)
-        else:
-            self.bias = 0.0  # hack
-
-    def zero_init_with_skip_(self):
-        """
-        Initialize weights to zero matrix if skip connection
-        """
-        with torch.no_grad():
-            nn.init.zeros_(self.layer)
-
-    def zero_init_(self):
-        """
-        Initialize weights to identity matrix if no skip connection
-        """
-        with torch.no_grad():
-            for i in range(self.layer.shape[0]):
-                try:
-                    nn.init.eye_(self.layer[i])
-                except RuntimeError:
-                    with torch.no_grad():
-                        dtype = self.layer[i].dtype
-                        weight = torch.eye(
-                            *self.layer[i].shape,
-                            requires_grad=self.layer[i].requires_grad,
-                            device=self.layer[i].device,
-                        )
-                        self.layer[i] = weight.to(dtype=dtype)
-
-    def forward(self, x: torch.Tensor):
-        """
-        Assume x.shape is (batch_size, num_heads, seq_len, head_dim)
-        """
-        _x = torch.einsum("hdf,bhld->bhlf", self.layer, x) + self.bias
-        return x + _x if self.skip_connection else _x
-
-
-class FeatureMapAdapter(FeatureMapMLP):
-    """
-    Learnable Feature map with bottleneck adapter
-    as in https://arxiv.org/abs/1902.00751
-
-    We don't use but could be fun to try
-    """
-
-    def __init__(self, hidden_dim: int, *args, **kwargs):
-        kwargs["skip_connection"] = True
-        kwargs["bias"] = True
-        kwargs["zero_init"] = True
-        self.hidden_dim = hidden_dim
-        super().__init__(*args, **kwargs)
-
-    def init_weights_(self):
-        """
-        Initialize (W)eights and (b)iases
-        """
-        kwargs = {"dtype": self.dtype, "device": self.device}
-        self.layer0 = nn.Parameter(
-            torch.zeros((self.num_heads, self.head_dim, self.hidden_dim), **kwargs)
-        )
-        self.layer1 = nn.Parameter(
-            torch.zeros((self.num_heads, self.hidden_dim, self.feature_dim), **kwargs)
-        )
-        nn.init.kaiming_uniform_(self.layer0)
-        nn.init.kaiming_uniform_(self.layer1)
-
-        self.bias0 = nn.Parameter(
-            torch.zeros((1, self.num_heads, 1, self.hidden_dim), **kwargs)
-        )
-        self.bias1 = nn.Parameter(
-            torch.zeros((1, self.num_heads, 1, self.feature_dim), **kwargs)
-        )
-        nn.init.kaiming_uniform_(self.bias0)
-        nn.init.kaiming_uniform_(self.bias1)
-
-    def zero_init_with_skip_(self):
-        with torch.no_grad():
-            nn.init.zeros_(self.layer0)
-            nn.init.zeros_(self.layer1)
-            nn.init.zeros_(self.bias0)
-            nn.init.zeros_(self.bias1)
-
-    def zero_init_(self):
-        raise NotImplementedError
-
-    def forward(self, x: torch.Tensor):
-        """
-        Assume x.shape is (batch_size, num_heads, seq_len, head_dim)
-        -> Down-project, apply nonlinearity, up-project; add skip connection
-        """
-        _x = torch.einsum("hde,bhld->bhle", self.layer0, x) + self.bias0
-        _x = F.relu(_x)
-        _x = torch.einsum("hef,bhle->bhlf", self.layer1, _x) + self.bias1
-        return x + _x if self.skip_connection else _x
--- a/src/axolotl/integrations/lolcats/linear_llama/linear_attention.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/linear_attention.py
@@ -7,6 +7,7 @@ from typing import Any, List, Optional, Tuple

 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from transformers.cache_utils import Cache

 # Causal linear attention dot product CUDA kernel from fast-transformers
@@ -17,8 +18,6 @@ except ImportError:

 from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv

-from .feature_map import init_feature_map, init_learned_kernel
-
 # -------------------
 # Attention functions
 # -------------------
@@ -523,3 +522,335 @@ class LinearAttentionState(Cache):
        raise NotImplementedError(
            "Reordering cache not implemented for LinearAttentionState"
        )
+
+
+# -------------------
+# feature map functions
+# -------------------
+
+
+def init_feature_map(name: str, mlp: nn.Module, **kwargs):
+    """
+    Initialize feature map final activation for linear attention
+    """
+    return FeatureMap(activation_name=name, mlp=mlp, **kwargs)
+
+
+def init_feature_map_act(name: str, fullspace: bool = True, **kwargs):
+    """
+    Initialize feature map final activation for linear attention
+    """
+    if name == "softmax_dim" and fullspace:
+        return SoftmaxDim(**kwargs)
+    elif name == "softmax_dim" and not fullspace:
+        return SoftmaxDimHalfspace(**kwargs)
+    elif name == "exp_dim" and fullspace:
+        return Exp(**kwargs)
+    elif name == "exp_dim" and not fullspace:
+        return ExpHalfspace(**kwargs)
+    elif name == "pos_elu":
+        return PosELU(**kwargs)
+    elif name == "relu":
+        return ReLU(**kwargs)
+
+    else:
+        raise NotImplementedError
+
+
+def init_learned_kernel(name: str, **kwargs):
+    """
+    Initialize feature map MLP for linear attention
+    """
+    if name == "untied_head_einsum":
+        return FeatureMapMLP(**kwargs)
+    elif name == "untied_head_adapter":
+        return FeatureMapAdapter(**kwargs)
+    else:
+        raise NotImplementedError
+
+
+class FeatureMap(nn.Module):
+    """
+    Final 'activation' of feature map. Can probably be combined with
+    `FeatureMapMLP` below
+
+    Full feature map is like f(xW + b)
+    -> This is the `f` part
+    """
+
+    def __init__(
+        self,
+        activation_name: str,
+        head_dim_idx: int = -1,
+        eps: float = 1e-12,
+        mlp: Optional[nn.Module] = None,
+        fullspace: bool = True,
+    ):
+        super().__init__()
+        self.head_dim_idx = head_dim_idx
+        self.eps = eps
+        self.mlp = mlp if mlp is not None else nn.Identity()
+        self.activation = init_feature_map_act(activation_name, fullspace, eps=eps)
+
+    def forward(self, x: torch.Tensor, *mlp_args, **mlp_kwargs):
+        """
+        Assume x.shape is (batch_size, n_heads, seq_len, head_dim)
+        """
+        return self.activation(self.mlp(x, *mlp_args, **mlp_kwargs), x)
+
+    def q_map(self, *args, **kwargs):
+        """
+        Use for inference in case q and k feature maps differ
+        """
+        return self.forward(*args, **kwargs)
+
+    def k_map(self, *args, **kwargs):
+        """
+        Use for inference in case q and k feature maps differ
+        """
+        return self.forward(*args, **kwargs)
+
+
+# -----------------------
+# Feature map activations
+# -----------------------
+class FeatureMapAct(nn.Module):
+    """
+    Base class for feature map activations
+    """
+
+    def __init__(self, eps: float = 1e-12):
+        super().__init__()
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        """
+        x.shape is (batch_size, n_heads, seq_len, head_dim)
+        """
+        return x
+
+
+class PosELU(FeatureMapAct):
+    """
+    1 + ELU activation as in https://arxiv.org/abs/2006.16236
+    """
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        return (1 + F.elu(x)).clamp(min=self.eps)
+
+
+class ReLU(FeatureMapAct):
+    """
+    ReLU activation as in https://arxiv.org/abs/2103.13076
+    """
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        return F.relu(x).clamp(min=self.eps)
+
+
+class SoftmaxDim(FeatureMapAct):
+    """
+    Softmax activation as in https://arxiv.org/abs/2402.04347
+    """
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        return torch.cat(
+            [torch.softmax(x, dim=-1), torch.softmax(-x, dim=-1)], dim=-1
+        ).clamp(min=self.eps)
+
+
+class SoftmaxDimHalfspace(FeatureMapAct):
+    """
+    Softmax activation as in https://arxiv.org/abs/2402.04347
+    """
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        return torch.softmax(x, dim=-1).clamp(min=self.eps)
+
+
+class Exp(FeatureMapAct):
+    """
+    Exp activation as in https://arxiv.org/abs/2402.04347
+    """
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        x_max = torch.amax(x, dim=-1, keepdim=True)
+        x_min = torch.amin(x, dim=-1, keepdim=True)
+        return torch.cat([torch.exp(x - x_max), torch.exp(-x + x_min)], dim=-1).clamp(
+            min=self.eps
+        )
+
+
+class ExpHalfspace(FeatureMapAct):
+    """
+    Exp activation as in https://arxiv.org/abs/2402.04347
+    """
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        x_max = torch.amax(x, dim=-1, keepdim=True)
+        return torch.exp(x - x_max).clamp(min=self.eps)
+
+
+# ----------------
+# Feature map MLPs
+# ----------------
+
+
+class FeatureMapMLP(nn.Module):
+    """
+    Learnable MLP in feature map.
+
+    Full feature map is like f(xW + b)
+    -> This is the `W` and (optional) `b` part
+    """
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_dim: int,  # input dim
+        feature_dim: int,  # output dim
+        dtype: torch.dtype,
+        device: torch.device,
+        skip_connection: bool = False,
+        bias: bool = False,
+        zero_init: bool = False,
+        normal_init: bool = False,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.feature_dim = feature_dim
+        self.dtype = dtype
+        self.device = device
+        self.skip_connection = skip_connection
+        self.bias = bias
+        self.zero_init = zero_init
+        self.normal_init = normal_init
+        self.init_weights_()
+
+        if self.zero_init:  # Zero-out weights or set as identity post-initialization
+            self.zero_init_with_skip_() if self.skip_connection else self.zero_init_()
+
+        if self.normal_init:
+            with torch.no_grad():
+                nn.init.normal_(self.layer)
+
+        if self.skip_connection:
+            assertion_fail = f"If self.skip_connection we need self.head_dim == self.feature_dim but self.head_dim is {self.head_dim} != self.feature_dim is {self.feature_dim}"
+            assert self.head_dim == self.feature_dim, assertion_fail
+
+    def init_weights_(self):
+        """
+        Initialize (W)eights and (b)iases
+        """
+        self.layer = nn.Parameter(
+            torch.zeros(
+                (self.num_heads, self.head_dim, self.feature_dim),
+                dtype=self.dtype,
+                device=self.device,
+            )
+        )
+        nn.init.kaiming_uniform_(self.layer)
+
+        if self.bias:
+            self.bias = nn.Parameter(
+                torch.zeros(
+                    (1, self.num_heads, 1, 1),  # self.feature_dim),
+                    dtype=self.dtype,
+                    device=self.device,
+                )
+            )
+            nn.init.kaiming_uniform_(self.bias)
+        else:
+            self.bias = 0.0  # hack
+
+    def zero_init_with_skip_(self):
+        """
+        Initialize weights to zero matrix if skip connection
+        """
+        with torch.no_grad():
+            nn.init.zeros_(self.layer)
+
+    def zero_init_(self):
+        """
+        Initialize weights to identity matrix if no skip connection
+        """
+        with torch.no_grad():
+            for i in range(self.layer.shape[0]):
+                try:
+                    nn.init.eye_(self.layer[i])
+                except RuntimeError:
+                    with torch.no_grad():
+                        dtype = self.layer[i].dtype
+                        weight = torch.eye(
+                            *self.layer[i].shape,
+                            requires_grad=self.layer[i].requires_grad,
+                            device=self.layer[i].device,
+                        )
+                        self.layer[i] = weight.to(dtype=dtype)
+
+    def forward(self, x: torch.Tensor):
+        """
+        Assume x.shape is (batch_size, num_heads, seq_len, head_dim)
+        """
+        _x = torch.einsum("hdf,bhld->bhlf", self.layer, x) + self.bias
+        return x + _x if self.skip_connection else _x
+
+
+class FeatureMapAdapter(FeatureMapMLP):
+    """
+    Learnable Feature map with bottleneck adapter
+    as in https://arxiv.org/abs/1902.00751
+
+    We don't use but could be fun to try
+    """
+
+    def __init__(self, hidden_dim: int, *args, **kwargs):
+        kwargs["skip_connection"] = True
+        kwargs["bias"] = True
+        kwargs["zero_init"] = True
+        self.hidden_dim = hidden_dim
+        super().__init__(*args, **kwargs)
+
+    def init_weights_(self):
+        """
+        Initialize (W)eights and (b)iases
+        """
+        kwargs = {"dtype": self.dtype, "device": self.device}
+        self.layer0 = nn.Parameter(
+            torch.zeros((self.num_heads, self.head_dim, self.hidden_dim), **kwargs)
+        )
+        self.layer1 = nn.Parameter(
+            torch.zeros((self.num_heads, self.hidden_dim, self.feature_dim), **kwargs)
+        )
+        nn.init.kaiming_uniform_(self.layer0)
+        nn.init.kaiming_uniform_(self.layer1)
+
+        self.bias0 = nn.Parameter(
+            torch.zeros((1, self.num_heads, 1, self.hidden_dim), **kwargs)
+        )
+        self.bias1 = nn.Parameter(
+            torch.zeros((1, self.num_heads, 1, self.feature_dim), **kwargs)
+        )
+        nn.init.kaiming_uniform_(self.bias0)
+        nn.init.kaiming_uniform_(self.bias1)
+
+    def zero_init_with_skip_(self):
+        with torch.no_grad():
+            nn.init.zeros_(self.layer0)
+            nn.init.zeros_(self.layer1)
+            nn.init.zeros_(self.bias0)
+            nn.init.zeros_(self.bias1)
+
+    def zero_init_(self):
+        raise NotImplementedError
+
+    def forward(self, x: torch.Tensor):
+        """
+        Assume x.shape is (batch_size, num_heads, seq_len, head_dim)
+        -> Down-project, apply nonlinearity, up-project; add skip connection
+        """
+        _x = torch.einsum("hde,bhld->bhle", self.layer0, x) + self.bias0
+        _x = F.relu(_x)
+        _x = torch.einsum("hef,bhle->bhlf", self.layer1, _x) + self.bias1
+        return x + _x if self.skip_connection else _x