feat: update readme with inference instructions

fix: lin_attn_mask in wrong dtype
fix: use existing model config
2025-02-06 21:29:36 +07:00 · 2025-02-06 15:25:33 +07:00 · 2025-02-06 00:12:14 +07:00 · 2025-02-05 23:46:59 +07:00 · 2025-02-05 19:40:11 +07:00 · 2025-02-05 19:35:15 +07:00
17 changed files with 407 additions and 639 deletions
--- a/src/axolotl/cli/convert_linear_attention.py
+++ b/src/axolotl/cli/convert_linear_attention.py
@@ -49,12 +49,9 @@ def do_linearize(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
    for p in model.parameters():
        p.requires_grad = False

-    # load config
-    base_config = load_model_config(cfg)
-
    # convert to linear llama
    linear_llama_config = LinearLlamaConfig.from_llama(
-        base_config, cfg.attention_config
+        model.config, cfg.attention_config
    )
    model = LinearLlamaForCausalLM.from_llama(
        model, config=linear_llama_config, train_attention=True
--- a/src/axolotl/integrations/lolcats/README.md
+++ b/src/axolotl/integrations/lolcats/README.md
@@ -4,7 +4,17 @@ https://github.com/HazyResearch/lolcats/

 ### Usage

-TODO: Add instruction to install `causal_dot_product`.
+Install `causal_dot_product` CUDA kernel (check the README in the `csrc` directory):
+
+```bash
+cd src/axolotl/integrations/lolcats/linear_llama/csrc
+
+# Edit `setup.py` to point to the correct CUDA capabilities L40-44
+# nano setup.py
+
+# Build the CUDA kernel
+python setup.py install
+```

 Step 1:

@@ -15,7 +25,9 @@ plugins:
 linearize: true
 ```

-Step 2: Remove the config above and finetune with lora with below possible targets.
+Run axolotl: `python -m axolotl.cli.convert_linear_attention config.yaml` TODO: change path CLI
+
+Step 2: Remove the config `linearize: true` and finetune with lora with below possible targets.

 ```yaml
 lora_target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
@@ -24,3 +36,9 @@ lora_target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
 # to allow this config to work with lora
 # unfrozen_parameters: ['.*feature_map_q.mlp.layer.*', '.*feature_map_k.mlp.layer.*', '.*window_factors.*']
 ```
+
+`axolotl train config.yaml --base-model={output_dir}/distilled --trust-remote-code --learning-rate=0.0001 # --wandb-project="..."`
+
+Step 3: Run inference on the finetuned model
+
+`axolotl inference config.yaml --lora-model-dir="{output_dir}" --trust-remote-code # --prompter="AlpacaPrompter"`
--- a/src/axolotl/integrations/lolcats/args.py
+++ b/src/axolotl/integrations/lolcats/args.py
@@ -44,4 +44,4 @@ class LinearAttentionArgs(BaseModel):

    attention_config: AttentionConfig

-    linearize: bool
+    linearize: Optional[bool] = False
--- a/src/axolotl/integrations/lolcats/linear_llama/attention/init.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/attention/init.py
@@ -1,21 +0,0 @@
-"""
-Linear and linear attention + sliding window classes
-"""
-from .linear_attention import LinearAttentionState, LolcatsLinearAttention
-from .linear_window_attention_sw import (
-    LinearAttentionSlidingWindowCache,
-    LolcatsSlidingWindowAttention,
-)
-from .linear_window_attention_sw_linear import LolcatsLinearSlidingWindowAttention
-from .linear_window_attention_sw_long import LolcatsSlidingWindowLongAttention
-from .linear_window_attention_tk import (
-    LinearAttentionTKWindowCache,
-    LolcatsTKWindowAttention,
-)
-from .linear_window_attention_tk_gen import (
-    LinearAttentionTKWindowGenerationCache,
-    LolcatsWindowAttentionTKGen,
-)
-
-# Experimental chunk linear attentions
-from .linear_window_attention_tk_long import LolcatsTKWindowLongAttention
--- a/src/axolotl/integrations/lolcats/linear_llama/attention/utils.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/attention/utils.py
@@ -1,34 +0,0 @@
-"""
-Shared attention helpers
-"""
-
-import torch
-
-
-# Copied from transformers.models.mistral.modeling_mistral (llama.modeling_llama at v4.36)
-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
-    The hidden states go from:
-       (batch, num_key_value_heads, seqlen, head_dim) to
-       (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(
-        batch, num_key_value_heads, n_rep, slen, head_dim
-    )
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
-def mask_attention(
-    qk_dot: torch.Tensor, attn_mask: torch.Tensor, mask_value: float = -10000
-) -> torch.Tensor:
-    """
-    Apply attention mask (e.g., for padding)
-    """
-    if len(attn_mask.shape) == 4:  # attn_mask either (b, h, l, d) or (b, l)
-        return qk_dot.masked_fill(~attn_mask.bool(), mask_value)
-    else:
-        return qk_dot.masked_fill(~attn_mask[:, None, None, :].bool(), mask_value)
--- a/src/axolotl/integrations/lolcats/linear_llama/configuration_linear_llama.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/configuration_linear_llama.py
@@ -64,6 +64,13 @@ class LinearLlamaConfig(LlamaConfig):
    def __init__(self, attention_config: Optional[dict] = None, **kwargs):
        super().__init__(**kwargs)

+        # Set auto_map
+        self.auto_map = {
+            "AutoConfig": "configuration_linear_llama.LinearLlamaConfig",
+            "AutoModel": "modeling_linear_llama.LinearLlamaModel",
+            "AutoModelForCausalLM": "modeling_linear_llama.LinearLlamaForCausalLM",
+        }
+
        # Set default attention config if none provided
        self.attention_config = attention_config or {"attention_type": "softmax"}

--- a/src/axolotl/integrations/lolcats/linear_llama/attention/linear_attention.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/attention/linear_attention.py
@@ -7,6 +7,7 @@ from typing import Any, List, Optional, Tuple

 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from transformers.cache_utils import Cache

 # Causal linear attention dot product CUDA kernel from fast-transformers
@@ -15,9 +16,7 @@ try:
 except ImportError:
    fast_causal_dot_product = None

-from ..model.feature_map import init_feature_map, init_learned_kernel
-from ..model.rotary import apply_rotary_pos_emb
-from .utils import repeat_kv
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv

 # -------------------
 # Attention functions
@@ -366,7 +365,7 @@ class LolcatsLinearAttention(nn.Module):
                        ..., None
                    ]  # b, 1, k_len, 1
                else:
-                    lin_attn_mask = attention_mask[:, None, :, None]  # b, 1, k_len, 1
+                    lin_attn_mask = attention_mask.bool()[:, None, :, None]  # b, 1, k_len, 1
                k = k.masked_fill(~lin_attn_mask, 0)

            if past_key_value is not None:  # Initialize states
@@ -523,3 +522,335 @@ class LinearAttentionState(Cache):
        raise NotImplementedError(
            "Reordering cache not implemented for LinearAttentionState"
        )
+
+
+# -------------------
+# feature map functions
+# -------------------
+
+
+def init_feature_map(name: str, mlp: nn.Module, **kwargs):
+    """
+    Initialize feature map final activation for linear attention
+    """
+    return FeatureMap(activation_name=name, mlp=mlp, **kwargs)
+
+
+def init_feature_map_act(name: str, fullspace: bool = True, **kwargs):
+    """
+    Initialize feature map final activation for linear attention
+    """
+    if name == "softmax_dim" and fullspace:
+        return SoftmaxDim(**kwargs)
+    elif name == "softmax_dim" and not fullspace:
+        return SoftmaxDimHalfspace(**kwargs)
+    elif name == "exp_dim" and fullspace:
+        return Exp(**kwargs)
+    elif name == "exp_dim" and not fullspace:
+        return ExpHalfspace(**kwargs)
+    elif name == "pos_elu":
+        return PosELU(**kwargs)
+    elif name == "relu":
+        return ReLU(**kwargs)
+
+    else:
+        raise NotImplementedError
+
+
+def init_learned_kernel(name: str, **kwargs):
+    """
+    Initialize feature map MLP for linear attention
+    """
+    if name == "untied_head_einsum":
+        return FeatureMapMLP(**kwargs)
+    elif name == "untied_head_adapter":
+        return FeatureMapAdapter(**kwargs)
+    else:
+        raise NotImplementedError
+
+
+class FeatureMap(nn.Module):
+    """
+    Final 'activation' of feature map. Can probably be combined with
+    `FeatureMapMLP` below
+
+    Full feature map is like f(xW + b)
+    -> This is the `f` part
+    """
+
+    def __init__(
+        self,
+        activation_name: str,
+        head_dim_idx: int = -1,
+        eps: float = 1e-12,
+        mlp: Optional[nn.Module] = None,
+        fullspace: bool = True,
+    ):
+        super().__init__()
+        self.head_dim_idx = head_dim_idx
+        self.eps = eps
+        self.mlp = mlp if mlp is not None else nn.Identity()
+        self.activation = init_feature_map_act(activation_name, fullspace, eps=eps)
+
+    def forward(self, x: torch.Tensor, *mlp_args, **mlp_kwargs):
+        """
+        Assume x.shape is (batch_size, n_heads, seq_len, head_dim)
+        """
+        return self.activation(self.mlp(x, *mlp_args, **mlp_kwargs), x)
+
+    def q_map(self, *args, **kwargs):
+        """
+        Use for inference in case q and k feature maps differ
+        """
+        return self.forward(*args, **kwargs)
+
+    def k_map(self, *args, **kwargs):
+        """
+        Use for inference in case q and k feature maps differ
+        """
+        return self.forward(*args, **kwargs)
+
+
+# -----------------------
+# Feature map activations
+# -----------------------
+class FeatureMapAct(nn.Module):
+    """
+    Base class for feature map activations
+    """
+
+    def __init__(self, eps: float = 1e-12):
+        super().__init__()
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        """
+        x.shape is (batch_size, n_heads, seq_len, head_dim)
+        """
+        return x
+
+
+class PosELU(FeatureMapAct):
+    """
+    1 + ELU activation as in https://arxiv.org/abs/2006.16236
+    """
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        return (1 + F.elu(x)).clamp(min=self.eps)
+
+
+class ReLU(FeatureMapAct):
+    """
+    ReLU activation as in https://arxiv.org/abs/2103.13076
+    """
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        return F.relu(x).clamp(min=self.eps)
+
+
+class SoftmaxDim(FeatureMapAct):
+    """
+    Softmax activation as in https://arxiv.org/abs/2402.04347
+    """
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        return torch.cat(
+            [torch.softmax(x, dim=-1), torch.softmax(-x, dim=-1)], dim=-1
+        ).clamp(min=self.eps)
+
+
+class SoftmaxDimHalfspace(FeatureMapAct):
+    """
+    Softmax activation as in https://arxiv.org/abs/2402.04347
+    """
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        return torch.softmax(x, dim=-1).clamp(min=self.eps)
+
+
+class Exp(FeatureMapAct):
+    """
+    Exp activation as in https://arxiv.org/abs/2402.04347
+    """
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        x_max = torch.amax(x, dim=-1, keepdim=True)
+        x_min = torch.amin(x, dim=-1, keepdim=True)
+        return torch.cat([torch.exp(x - x_max), torch.exp(-x + x_min)], dim=-1).clamp(
+            min=self.eps
+        )
+
+
+class ExpHalfspace(FeatureMapAct):
+    """
+    Exp activation as in https://arxiv.org/abs/2402.04347
+    """
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        x_max = torch.amax(x, dim=-1, keepdim=True)
+        return torch.exp(x - x_max).clamp(min=self.eps)
+
+
+# ----------------
+# Feature map MLPs
+# ----------------
+
+
+class FeatureMapMLP(nn.Module):
+    """
+    Learnable MLP in feature map.
+
+    Full feature map is like f(xW + b)
+    -> This is the `W` and (optional) `b` part
+    """
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_dim: int,  # input dim
+        feature_dim: int,  # output dim
+        dtype: torch.dtype,
+        device: torch.device,
+        skip_connection: bool = False,
+        bias: bool = False,
+        zero_init: bool = False,
+        normal_init: bool = False,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.feature_dim = feature_dim
+        self.dtype = dtype
+        self.device = device
+        self.skip_connection = skip_connection
+        self.bias = bias
+        self.zero_init = zero_init
+        self.normal_init = normal_init
+        self.init_weights_()
+
+        if self.zero_init:  # Zero-out weights or set as identity post-initialization
+            self.zero_init_with_skip_() if self.skip_connection else self.zero_init_()
+
+        if self.normal_init:
+            with torch.no_grad():
+                nn.init.normal_(self.layer)
+
+        if self.skip_connection:
+            assertion_fail = f"If self.skip_connection we need self.head_dim == self.feature_dim but self.head_dim is {self.head_dim} != self.feature_dim is {self.feature_dim}"
+            assert self.head_dim == self.feature_dim, assertion_fail
+
+    def init_weights_(self):
+        """
+        Initialize (W)eights and (b)iases
+        """
+        self.layer = nn.Parameter(
+            torch.zeros(
+                (self.num_heads, self.head_dim, self.feature_dim),
+                dtype=self.dtype,
+                device=self.device,
+            )
+        )
+        nn.init.kaiming_uniform_(self.layer)
+
+        if self.bias:
+            self.bias = nn.Parameter(
+                torch.zeros(
+                    (1, self.num_heads, 1, 1),  # self.feature_dim),
+                    dtype=self.dtype,
+                    device=self.device,
+                )
+            )
+            nn.init.kaiming_uniform_(self.bias)
+        else:
+            self.bias = 0.0  # hack
+
+    def zero_init_with_skip_(self):
+        """
+        Initialize weights to zero matrix if skip connection
+        """
+        with torch.no_grad():
+            nn.init.zeros_(self.layer)
+
+    def zero_init_(self):
+        """
+        Initialize weights to identity matrix if no skip connection
+        """
+        with torch.no_grad():
+            for i in range(self.layer.shape[0]):
+                try:
+                    nn.init.eye_(self.layer[i])
+                except RuntimeError:
+                    with torch.no_grad():
+                        dtype = self.layer[i].dtype
+                        weight = torch.eye(
+                            *self.layer[i].shape,
+                            requires_grad=self.layer[i].requires_grad,
+                            device=self.layer[i].device,
+                        )
+                        self.layer[i] = weight.to(dtype=dtype)
+
+    def forward(self, x: torch.Tensor):
+        """
+        Assume x.shape is (batch_size, num_heads, seq_len, head_dim)
+        """
+        _x = torch.einsum("hdf,bhld->bhlf", self.layer, x) + self.bias
+        return x + _x if self.skip_connection else _x
+
+
+class FeatureMapAdapter(FeatureMapMLP):
+    """
+    Learnable Feature map with bottleneck adapter
+    as in https://arxiv.org/abs/1902.00751
+
+    We don't use but could be fun to try
+    """
+
+    def __init__(self, hidden_dim: int, *args, **kwargs):
+        kwargs["skip_connection"] = True
+        kwargs["bias"] = True
+        kwargs["zero_init"] = True
+        self.hidden_dim = hidden_dim
+        super().__init__(*args, **kwargs)
+
+    def init_weights_(self):
+        """
+        Initialize (W)eights and (b)iases
+        """
+        kwargs = {"dtype": self.dtype, "device": self.device}
+        self.layer0 = nn.Parameter(
+            torch.zeros((self.num_heads, self.head_dim, self.hidden_dim), **kwargs)
+        )
+        self.layer1 = nn.Parameter(
+            torch.zeros((self.num_heads, self.hidden_dim, self.feature_dim), **kwargs)
+        )
+        nn.init.kaiming_uniform_(self.layer0)
+        nn.init.kaiming_uniform_(self.layer1)
+
+        self.bias0 = nn.Parameter(
+            torch.zeros((1, self.num_heads, 1, self.hidden_dim), **kwargs)
+        )
+        self.bias1 = nn.Parameter(
+            torch.zeros((1, self.num_heads, 1, self.feature_dim), **kwargs)
+        )
+        nn.init.kaiming_uniform_(self.bias0)
+        nn.init.kaiming_uniform_(self.bias1)
+
+    def zero_init_with_skip_(self):
+        with torch.no_grad():
+            nn.init.zeros_(self.layer0)
+            nn.init.zeros_(self.layer1)
+            nn.init.zeros_(self.bias0)
+            nn.init.zeros_(self.bias1)
+
+    def zero_init_(self):
+        raise NotImplementedError
+
+    def forward(self, x: torch.Tensor):
+        """
+        Assume x.shape is (batch_size, num_heads, seq_len, head_dim)
+        -> Down-project, apply nonlinearity, up-project; add skip connection
+        """
+        _x = torch.einsum("hde,bhld->bhle", self.layer0, x) + self.bias0
+        _x = F.relu(_x)
+        _x = torch.einsum("hef,bhle->bhlf", self.layer1, _x) + self.bias1
+        return x + _x if self.skip_connection else _x
--- a/src/axolotl/integrations/lolcats/linear_llama/attention/linear_window_attention_sw.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/attention/linear_window_attention_sw.py
--- a/src/axolotl/integrations/lolcats/linear_llama/attention/linear_window_attention_sw_linear.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/attention/linear_window_attention_sw_linear.py
@@ -23,7 +23,7 @@ try:
 except ModuleNotFoundError:
    _flash_attention_forward = None  # Transformers v4.36

-from ..model.rotary import apply_rotary_pos_emb
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb

 # Causal linear attention dot product CUDA kernel from fast-transformers
 from .linear_attention import (
@@ -32,9 +32,7 @@ from .linear_attention import (
    causal_dot_product,
 )

-LOG = logging.getLogger(
-    "axolotl.integrations.lolcats.linear_attention.linear_window_attention_sw_long"
-)
+LOG = logging.getLogger(__name__)


 # ----------------------
--- a/src/axolotl/integrations/lolcats/linear_llama/attention/linear_window_attention_sw_long.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/attention/linear_window_attention_sw_long.py
--- a/src/axolotl/integrations/lolcats/linear_llama/attention/linear_window_attention_tk.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/attention/linear_window_attention_tk.py
--- a/src/axolotl/integrations/lolcats/linear_llama/attention/linear_window_attention_tk_gen.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/attention/linear_window_attention_tk_gen.py
@@ -11,9 +11,7 @@ import torch.nn.functional as F
 from .linear_attention import LinearAttentionState
 from .linear_window_attention_tk_long import LolcatsTKWindowLongAttention

-LOG = logging.getLogger(
-    "axolotl.integrations.lolcats.linear_attention.linear_attention_tk_gen"
-)
+LOG = logging.getLogger(__name__)

 try:
    from thunderkittens import hedgehog as tk_window_hedgehog_attention
--- a/src/axolotl/integrations/lolcats/linear_llama/attention/linear_window_attention_tk_long.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/attention/linear_window_attention_tk_long.py
@@ -22,7 +22,8 @@ try:
 except ModuleNotFoundError:
    _flash_attention_forward = None  # Transformers v4.36

-from ..model.rotary import apply_rotary_pos_emb
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
+
 from .linear_attention import softmax_attention
 from .linear_window_attention_tk import LolcatsTKWindowAttention

--- a/src/axolotl/integrations/lolcats/linear_llama/model/init.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/model/init.py
--- a/src/axolotl/integrations/lolcats/linear_llama/model/feature_map.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/model/feature_map.py
@@ -1,336 +0,0 @@
-"""
-Learnable linear attention feature map classes and functions
-"""
-
-from typing import Optional
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-def init_feature_map(name: str, mlp: nn.Module, **kwargs):
-    """
-    Initialize feature map final activation for linear attention
-    """
-    return FeatureMap(activation_name=name, mlp=mlp, **kwargs)
-
-
-def init_feature_map_act(name: str, fullspace: bool = True, **kwargs):
-    """
-    Initialize feature map final activation for linear attention
-    """
-    if name == "softmax_dim" and fullspace:
-        return SoftmaxDim(**kwargs)
-    elif name == "softmax_dim" and not fullspace:
-        return SoftmaxDimHalfspace(**kwargs)
-    elif name == "exp_dim" and fullspace:
-        return Exp(**kwargs)
-    elif name == "exp_dim" and not fullspace:
-        return ExpHalfspace(**kwargs)
-    elif name == "pos_elu":
-        return PosELU(**kwargs)
-    elif name == "relu":
-        return ReLU(**kwargs)
-
-    else:
-        raise NotImplementedError
-
-
-def init_learned_kernel(name: str, **kwargs):
-    """
-    Initialize feature map MLP for linear attention
-    """
-    if name == "untied_head_einsum":
-        return FeatureMapMLP(**kwargs)
-    elif name == "untied_head_adapter":
-        return FeatureMapAdapter(**kwargs)
-    else:
-        raise NotImplementedError
-
-
-class FeatureMap(nn.Module):
-    """
-    Final 'activation' of feature map. Can probably be combined with
-    `FeatureMapMLP` below
-
-    Full feature map is like f(xW + b)
-    -> This is the `f` part
-    """
-
-    def __init__(
-        self,
-        activation_name: str,
-        head_dim_idx: int = -1,
-        eps: float = 1e-12,
-        mlp: Optional[nn.Module] = None,
-        fullspace: bool = True,
-    ):
-        super().__init__()
-        self.head_dim_idx = head_dim_idx
-        self.eps = eps
-        self.mlp = mlp if mlp is not None else nn.Identity()
-        self.activation = init_feature_map_act(activation_name, fullspace, eps=eps)
-
-    def forward(self, x: torch.Tensor, *mlp_args, **mlp_kwargs):
-        """
-        Assume x.shape is (batch_size, n_heads, seq_len, head_dim)
-        """
-        return self.activation(self.mlp(x, *mlp_args, **mlp_kwargs), x)
-
-    def q_map(self, *args, **kwargs):
-        """
-        Use for inference in case q and k feature maps differ
-        """
-        return self.forward(*args, **kwargs)
-
-    def k_map(self, *args, **kwargs):
-        """
-        Use for inference in case q and k feature maps differ
-        """
-        return self.forward(*args, **kwargs)
-
-
-# -----------------------
-# Feature map activations
-# -----------------------
-class FeatureMapAct(nn.Module):
-    """
-    Base class for feature map activations
-    """
-
-    def __init__(self, eps: float = 1e-12):
-        super().__init__()
-        self.eps = eps
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        """
-        x.shape is (batch_size, n_heads, seq_len, head_dim)
-        """
-        return x
-
-
-class PosELU(FeatureMapAct):
-    """
-    1 + ELU activation as in https://arxiv.org/abs/2006.16236
-    """
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        return (1 + F.elu(x)).clamp(min=self.eps)
-
-
-class ReLU(FeatureMapAct):
-    """
-    ReLU activation as in https://arxiv.org/abs/2103.13076
-    """
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        return F.relu(x).clamp(min=self.eps)
-
-
-class SoftmaxDim(FeatureMapAct):
-    """
-    Softmax activation as in https://arxiv.org/abs/2402.04347
-    """
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        return torch.cat(
-            [torch.softmax(x, dim=-1), torch.softmax(-x, dim=-1)], dim=-1
-        ).clamp(min=self.eps)
-
-
-class SoftmaxDimHalfspace(FeatureMapAct):
-    """
-    Softmax activation as in https://arxiv.org/abs/2402.04347
-    """
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        return torch.softmax(x, dim=-1).clamp(min=self.eps)
-
-
-class Exp(FeatureMapAct):
-    """
-    Exp activation as in https://arxiv.org/abs/2402.04347
-    """
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        x_max = torch.amax(x, dim=-1, keepdim=True)
-        x_min = torch.amin(x, dim=-1, keepdim=True)
-        return torch.cat([torch.exp(x - x_max), torch.exp(-x + x_min)], dim=-1).clamp(
-            min=self.eps
-        )
-
-
-class ExpHalfspace(FeatureMapAct):
-    """
-    Exp activation as in https://arxiv.org/abs/2402.04347
-    """
-
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        x_max = torch.amax(x, dim=-1, keepdim=True)
-        return torch.exp(x - x_max).clamp(min=self.eps)
-
-
-# ----------------
-# Feature map MLPs
-# ----------------
-
-
-class FeatureMapMLP(nn.Module):
-    """
-    Learnable MLP in feature map.
-
-    Full feature map is like f(xW + b)
-    -> This is the `W` and (optional) `b` part
-    """
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_dim: int,  # input dim
-        feature_dim: int,  # output dim
-        dtype: torch.dtype,
-        device: torch.device,
-        skip_connection: bool = False,
-        bias: bool = False,
-        zero_init: bool = False,
-        normal_init: bool = False,
-    ):
-        super().__init__()
-        self.num_heads = num_heads
-        self.head_dim = head_dim
-        self.feature_dim = feature_dim
-        self.dtype = dtype
-        self.device = device
-        self.skip_connection = skip_connection
-        self.bias = bias
-        self.zero_init = zero_init
-        self.normal_init = normal_init
-        self.init_weights_()
-
-        if self.zero_init:  # Zero-out weights or set as identity post-initialization
-            self.zero_init_with_skip_() if self.skip_connection else self.zero_init_()
-
-        if self.normal_init:
-            with torch.no_grad():
-                nn.init.normal_(self.layer)
-
-        if self.skip_connection:
-            assertion_fail = f"If self.skip_connection we need self.head_dim == self.feature_dim but self.head_dim is {self.head_dim} != self.feature_dim is {self.feature_dim}"
-            assert self.head_dim == self.feature_dim, assertion_fail
-
-    def init_weights_(self):
-        """
-        Initialize (W)eights and (b)iases
-        """
-        self.layer = nn.Parameter(
-            torch.zeros(
-                (self.num_heads, self.head_dim, self.feature_dim),
-                dtype=self.dtype,
-                device=self.device,
-            )
-        )
-        nn.init.kaiming_uniform_(self.layer)
-
-        if self.bias:
-            self.bias = nn.Parameter(
-                torch.zeros(
-                    (1, self.num_heads, 1, 1),  # self.feature_dim),
-                    dtype=self.dtype,
-                    device=self.device,
-                )
-            )
-            nn.init.kaiming_uniform_(self.bias)
-        else:
-            self.bias = 0.0  # hack
-
-    def zero_init_with_skip_(self):
-        """
-        Initialize weights to zero matrix if skip connection
-        """
-        with torch.no_grad():
-            nn.init.zeros_(self.layer)
-
-    def zero_init_(self):
-        """
-        Initialize weights to identity matrix if no skip connection
-        """
-        with torch.no_grad():
-            for i in range(self.layer.shape[0]):
-                try:
-                    nn.init.eye_(self.layer[i])
-                except RuntimeError:
-                    with torch.no_grad():
-                        dtype = self.layer[i].dtype
-                        weight = torch.eye(
-                            *self.layer[i].shape,
-                            requires_grad=self.layer[i].requires_grad,
-                            device=self.layer[i].device,
-                        )
-                        self.layer[i] = weight.to(dtype=dtype)
-
-    def forward(self, x: torch.Tensor):
-        """
-        Assume x.shape is (batch_size, num_heads, seq_len, head_dim)
-        """
-        _x = torch.einsum("hdf,bhld->bhlf", self.layer, x) + self.bias
-        return x + _x if self.skip_connection else _x
-
-
-class FeatureMapAdapter(FeatureMapMLP):
-    """
-    Learnable Feature map with bottleneck adapter
-    as in https://arxiv.org/abs/1902.00751
-
-    We don't use but could be fun to try
-    """
-
-    def __init__(self, hidden_dim: int, *args, **kwargs):
-        kwargs["skip_connection"] = True
-        kwargs["bias"] = True
-        kwargs["zero_init"] = True
-        self.hidden_dim = hidden_dim
-        super().__init__(*args, **kwargs)
-
-    def init_weights_(self):
-        """
-        Initialize (W)eights and (b)iases
-        """
-        kwargs = {"dtype": self.dtype, "device": self.device}
-        self.layer0 = nn.Parameter(
-            torch.zeros((self.num_heads, self.head_dim, self.hidden_dim), **kwargs)
-        )
-        self.layer1 = nn.Parameter(
-            torch.zeros((self.num_heads, self.hidden_dim, self.feature_dim), **kwargs)
-        )
-        nn.init.kaiming_uniform_(self.layer0)
-        nn.init.kaiming_uniform_(self.layer1)
-
-        self.bias0 = nn.Parameter(
-            torch.zeros((1, self.num_heads, 1, self.hidden_dim), **kwargs)
-        )
-        self.bias1 = nn.Parameter(
-            torch.zeros((1, self.num_heads, 1, self.feature_dim), **kwargs)
-        )
-        nn.init.kaiming_uniform_(self.bias0)
-        nn.init.kaiming_uniform_(self.bias1)
-
-    def zero_init_with_skip_(self):
-        with torch.no_grad():
-            nn.init.zeros_(self.layer0)
-            nn.init.zeros_(self.layer1)
-            nn.init.zeros_(self.bias0)
-            nn.init.zeros_(self.bias1)
-
-    def zero_init_(self):
-        raise NotImplementedError
-
-    def forward(self, x: torch.Tensor):
-        """
-        Assume x.shape is (batch_size, num_heads, seq_len, head_dim)
-        -> Down-project, apply nonlinearity, up-project; add skip connection
-        """
-        _x = torch.einsum("hde,bhld->bhle", self.layer0, x) + self.bias0
-        _x = F.relu(_x)
-        _x = torch.einsum("hef,bhle->bhlf", self.layer1, _x) + self.bias1
-        return x + _x if self.skip_connection else _x
--- a/src/axolotl/integrations/lolcats/linear_llama/model/rotary.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/model/rotary.py
@@ -1,204 +0,0 @@
-# coding=utf-8
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Rotary embeddings. Same as usual for Transformer models.
-
-Note these are modified from HF Transformers v4.36, from:
- transformers/models/llama/modeling_llama.py or transformers/models/mistral/modeling_mistral.py
- i.e., https://github.com/huggingface/transformers/blob/a7cab3c283312b8d4de5df3bbe719971e24f4281/src/transformers/models/llama/modeling_llama.py#L123
-"""
-from typing import Optional
-
-import torch
-import torch.nn as nn
-
-
-def get_rotary_embeddings(
-    rope_scaling_type: Optional[str] = None,
-    head_dim: int = 128,
-    max_position_embeddings: int = 4096,
-    rope_theta: float = 10000.0,
-    rope_scaling_factor: float = 1.0,
-    device: Optional[torch.device] = None,
-) -> nn.Module:
-    """Return rotary embedding object"""
-    if rope_scaling_type is None:
-        return RotaryEmbedding(
-            head_dim,
-            max_position_embeddings=max_position_embeddings,
-            base=rope_theta,
-            device=device,
-        )
-    elif rope_scaling_type == "linear":
-        return LinearScalingRotaryEmbedding(
-            head_dim,
-            max_position_embeddings=max_position_embeddings,
-            scaling_factor=rope_scaling_factor,
-            base=rope_theta,
-            device=device,
-        )
-    elif rope_scaling_type == "dynamic":
-        return DynamicNTKScalingRotaryEmbedding(
-            head_dim,
-            max_position_embeddings=max_position_embeddings,
-            scaling_factor=rope_scaling_factor,
-            base=rope_theta,
-            device=device,
-        )
-    else:
-        raise NotImplementedError(
-            f'Sorry rope_scaling_type == "{rope_scaling_type}" not implemented.'
-        )
-
-
-# Copied from transformers.models.mistral.modeling_mistral (llama.modeling_llama at v4.36)
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-# Copied from transformers.models.mistral.modeling_mistral (llama.modeling_llama at v4.36)
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
-    """Applies Rotary Position Embedding to the query and key tensors."""
-    if position_ids is not None:
-        cos, sin = cos[position_ids], sin[position_ids]
-    cos = cos.unsqueeze(unsqueeze_dim)
-    sin = sin.unsqueeze(unsqueeze_dim)
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
-
-
-# Modified from transformers.models.mistral.modeling_mistral (llama.modeling_llama at v4.36)
-class RotaryEmbedding(nn.Module):
-    """Original Rotary Embeddings from RoFormer https://arxiv.org/abs/2104.09864"""
-
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
-        super().__init__()
-
-        self.dim = dim
-        self.max_position_embeddings = max_position_embeddings
-        self.base = base
-        inv_freq = 1.0 / (
-            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
-        )
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-        # Build here to make `torch.jit.trace` work.
-        self._set_cos_sin_cache(
-            seq_len=max_position_embeddings,
-            device=self.inv_freq.device,
-            dtype=torch.get_default_dtype(),
-        )
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(
-            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
-        )
-
-        freqs = torch.outer(t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-
-    def forward(self, x, seq_len=None):
-        """
-        Compute rotary embeddings
-        """
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        if seq_len > self.max_seq_len_cached:
-            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
-
-        return (
-            self.cos_cached[:seq_len].to(dtype=x.dtype),
-            self.sin_cached[:seq_len].to(dtype=x.dtype),
-        )
-
-
-# Copied from transformers/models/llama/modeling_llama.py at v4.36
-class LinearScalingRotaryEmbedding(RotaryEmbedding):
-    """RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
-
-    def __init__(
-        self,
-        dim,
-        max_position_embeddings=2048,
-        base=10000,
-        device=None,
-        scaling_factor=1.0,
-    ):
-        self.scaling_factor = scaling_factor
-        super().__init__(dim, max_position_embeddings, base, device)
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(
-            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
-        )
-        t = t / self.scaling_factor
-
-        freqs = torch.outer(t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-
-
-# Copied from transformers/models/llama/modeling_llama.py at v4.36
-class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
-    """RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
-
-    def __init__(
-        self,
-        dim,
-        max_position_embeddings=2048,
-        base=10000,
-        device=None,
-        scaling_factor=1.0,
-    ):
-        self.scaling_factor = scaling_factor
-        super().__init__(dim, max_position_embeddings, base, device)
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-
-        if seq_len > self.max_position_embeddings:
-            base = self.base * (
-                (self.scaling_factor * seq_len / self.max_position_embeddings)
-                - (self.scaling_factor - 1)
-            ) ** (self.dim / (self.dim - 2))
-            inv_freq = 1.0 / (
-                base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
-            )
-            self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-        t = torch.arange(
-            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
-        )
-
-        freqs = torch.outer(t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
--- a/src/axolotl/integrations/lolcats/linear_llama/modeling_linear_llama.py
+++ b/src/axolotl/integrations/lolcats/linear_llama/modeling_linear_llama.py
@@ -11,7 +11,7 @@

 import logging
 from functools import partial
-from typing import Any
+from typing import Any, Optional

 from torch import nn
 from tqdm import tqdm
@@ -23,7 +23,6 @@ from transformers.models.llama.modeling_llama import (
    LlamaRotaryEmbedding,
 )

-from .attention import LolcatsLinearAttention
 from .configuration_linear_llama import LinearLlamaConfig

 LOG = logging.getLogger(__name__)
@@ -36,11 +35,10 @@ class LinearLlamaDecoderLayer(LlamaDecoderLayer):

    def __init__(self, config: LinearLlamaConfig, layer_idx: int):
        super().__init__(config, layer_idx)
+
        # Replace the attention layer with our custom attention
-        self.self_attn = LolcatsLinearAttention(
-            base_attn=self.self_attn,  # type: ignore
-            layer_idx=layer_idx,
-            **config.attention_config,
+        self.self_attn = convert_llama_attention(
+            layer=self, attention_config=config.attention_config
        )


@@ -110,18 +108,22 @@ class LinearLlamaForCausalLM(LlamaForCausalLM):
        if config is None:
            raise ValueError("Missing config")

-        # initialize the model with prior weights
+        # initialize a new model with config
        new_model = cls(config=config)

-        del new_model.model  # remove the default model
+        # remove the default model and lm_head
+        del new_model.model
+        del new_model.lm_head
+
+        # load converted model, lm_head, and vocab_size from llama model
        new_model.model = convert_attention(
            model.model,
            attention_config=config.attention_config,
            train_attention=train_attention,
            remove_base_attn=remove_base_attn,
        )
-
-        new_model.lm_head.load_state_dict(model.lm_head.state_dict())
+        new_model.lm_head = model.lm_head
+        new_model.vocab_size = model.vocab_size

        return new_model

@@ -227,7 +229,7 @@ def traverse_layers(model: nn.Module, verbose: bool = False):
 def convert_llama_attention(
    layer: nn.Module,
    attention_config: dict,
-    layers: list[nn.Module],  # list of layers
+    layers: Optional[list[nn.Module]] = None,  # list of layers
    train_attention: bool = False,
    remove_base_attn: bool = True,
 ):
@@ -237,7 +239,7 @@ def convert_llama_attention(
    return get_attention(**attention_config)(
        base_attn=layer.self_attn,
        layer_idx=layer.self_attn.layer_idx,  # Transformers v4.36
-        max_layer_idx=len(layers) - 1,
+        max_layer_idx=len(layers) - 1 if layers else None,
        train_attention=train_attention,
        remove_base_attn=remove_base_attn,
    )
@@ -252,39 +254,41 @@ def get_attention(attention_type: str, **kwargs):
    kwargs["attention_type"] = attention_type

    if attention_type == "lolcats_llama":
-        from .attention import LolcatsLinearAttention
+        from .linear_attention import LolcatsLinearAttention

        return partial(LolcatsLinearAttention, **kwargs)

    elif attention_type == "lolcats_llama_window_tk":
-        from .attention import LolcatsTKWindowAttention
+        from .linear_window_attention_tk import LolcatsTKWindowAttention

        return partial(LolcatsTKWindowAttention, **kwargs)

    elif attention_type == "lolcats_llama_window_sw":
-        from .attention import LolcatsSlidingWindowAttention
+        from .linear_window_attention_sw import LolcatsSlidingWindowAttention

        return partial(LolcatsSlidingWindowAttention, **kwargs)

    elif attention_type == "lolcats_llama_window_sw_linear":
-        from .attention import LolcatsLinearSlidingWindowAttention
+        from .linear_window_attention_sw_linear import (
+            LolcatsLinearSlidingWindowAttention,
+        )

        return partial(LolcatsLinearSlidingWindowAttention, **kwargs)

    # Experimental chunked linear attentions below
    elif attention_type == "lolcats_long_llama_window_tk":
-        from .attention import LolcatsTKWindowLongAttention
+        from .linear_window_attention_tk_long import LolcatsTKWindowLongAttention

        return partial(LolcatsTKWindowLongAttention, **kwargs)

    elif attention_type == "lolcats_long_llama_window_sw":
-        from .attention import LolcatsSlidingWindowLongAttention
+        from .linear_window_attention_sw_long import LolcatsSlidingWindowLongAttention

        return partial(LolcatsSlidingWindowLongAttention, **kwargs)

    # TK generation build (requires Thunderkittens)
    elif attention_type == "lolcats_llama_window_tk_gen":
-        from .attention import LolcatsWindowAttentionTKGen
+        from .linear_window_attention_tk_gen import LolcatsWindowAttentionTKGen

        return partial(LolcatsWindowAttentionTKGen, **kwargs)

@@ -302,28 +306,32 @@ def get_attention_cache(attention_type: str, past_key_values: Any = None):

    # LOG.info(f'Returning attention cache based on attention_type == {attention_type}')
    elif "lolcats_llama_window_tk_gen" in attention_type:
-        from .attention import LinearAttentionTKWindowGenerationCache
+        from .linear_window_attention_tk_gen import (
+            LinearAttentionTKWindowGenerationCache,
+        )

        return LinearAttentionTKWindowGenerationCache()

    elif "llama_window_tk" in attention_type:
-        from .attention import LinearAttentionTKWindowCache
+        from .linear_window_attention_tk import LinearAttentionTKWindowCache

        return LinearAttentionTKWindowCache()

    elif "llama_window_sw" in attention_type:
-        from .attention import LinearAttentionSlidingWindowCache
+        from .linear_window_attention_sw import LinearAttentionSlidingWindowCache

        return LinearAttentionSlidingWindowCache()

    elif "llama_window_sw_linear" in attention_type:
-        from .attention import LinearAttentionSlidingWindowCache
+        from .linear_window_attention_sw import LinearAttentionSlidingWindowCache

        return LinearAttentionSlidingWindowCache()

    # TK generation build (requires Thunderkittens)
    elif attention_type == "lolcats_llama_window_tk_gen":
-        from .attention import LinearAttentionTKWindowGenerationCache
+        from .linear_window_attention_tk_gen import (
+            LinearAttentionTKWindowGenerationCache,
+        )

        return LinearAttentionTKWindowGenerationCache()

@@ -331,7 +339,7 @@ def get_attention_cache(attention_type: str, past_key_values: Any = None):
        return past_key_values

    else:
-        from .attention import LinearAttentionState
+        from .linear_attention import LinearAttentionState

        return LinearAttentionState()

@@ -346,3 +354,8 @@ def register_linear_llama():
    AutoConfig.register("linear_llama", LinearLlamaConfig)
    AutoModel.register(LinearLlamaConfig, LinearLlamaModel)
    AutoModelForCausalLM.register(LinearLlamaConfig, LinearLlamaForCausalLM)
+
+    # registering for auto classes to save files
+    LinearLlamaConfig.register_for_auto_class("AutoConfig")
+    LinearLlamaModel.register_for_auto_class("AutoModel")
+    LinearLlamaForCausalLM.register_for_auto_class("AutoModelForCausalLM")
Author	SHA1	Message	Date
NanoCode012	13d458d0ae	feat: update readme with inference instructions	2025-02-06 21:29:36 +07:00
NanoCode012	ebd406af1d	fix: lin_attn_mask in wrong dtype	2025-02-06 15:25:33 +07:00
NanoCode012	caa49a9d7d	fix: use existing model config	2025-02-06 00:12:14 +07:00
NanoCode012	c15ea6b956	fix: load vocab_size	2025-02-05 23:46:59 +07:00
NanoCode012	578fa764c8	chore: moved feature map into linear attention	2025-02-05 19:40:11 +07:00
NanoCode012	0e6efaa10c	fix: manually set auto-map	2025-02-05 19:35:15 +07:00
NanoCode012	c4cb622590	fix: remove redundant files	2025-02-05 19:34:06 +07:00
NanoCode012	0f82bd2d18	chore: improve instruction and made linearize optional	2025-02-05 19:33:15 +07:00
NanoCode012	49746b184f	chore: flatten directory structure and register to autoclass to save	2025-02-05 19:17:57 +07:00
NanoCode012	9e1c4de13c	fix: assign linear head instead of loading state dict	2025-02-05 18:24:31 +07:00