Compare commits


1 Commit

Author: Wing Lian
SHA1: f8acc72dd8
Message: proof of concept for sage attention
Date: 2024-11-22 14:47:19 -05:00
6 changed files with 1143 additions and 0 deletions

View File

@@ -0,0 +1,361 @@
"""
Copyright (c) 2024 by SageAttention team.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from typing import Any, Optional
import torch
from torch.autograd import Function
from .triton.attn_qk_int8_per_block_causal_varlen import (
backward as sageattn_varlen_backward,
)
from .triton.attn_qk_int8_per_block_causal_varlen import forward as attn_true_varlen
from .triton.quant_per_block_varlen import (
per_block_int8 as per_block_int8_varlen_triton,
)
def get_cuda_arch_versions():
cuda_archs = []
for i in range(torch.cuda.device_count()):
major, minor = torch.cuda.get_device_capability(i)
cuda_archs.append(f"sm{major}{minor}")
return cuda_archs
def sageattn_varlen(
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
cu_seqlens_q: torch.Tensor,
cu_seqlens_k: torch.Tensor,
max_seqlen_q: int,
max_seqlen_k: int,
sm_scale: Optional[float] = None,
smooth_k: bool = True,
**kwargs: Any,
) -> torch.Tensor:
"""
Parameters
----------
q : torch.Tensor
The query tensor, shape: ``[cu_seqlens_q[-1], num_qo_heads, head_dim]``.
k : torch.Tensor
The key tensor, shape: ``[cu_seqlens_k[-1], num_kv_heads, head_dim]``.
v : torch.Tensor
The value tensor, shape: ``[cu_seqlens_k[-1], num_kv_heads, head_dim]``.
cu_seqlens_q : torch.Tensor
The cumulative sequence lengths for the query sequences in the batch, used to index into `q`.
Shape: ``[batch_size + 1]``, where each entry represents the cumulative length of sequences up to that batch index.
cu_seqlens_k : torch.Tensor
The cumulative sequence lengths for the key and value sequences in the batch, used to index into `k` and `v`.
Shape: ``[batch_size + 1]``, where each entry represents the cumulative length of sequences up to that batch index.
max_seqlen_q : int
The maximum sequence length for the query tensor in the batch.
max_seqlen_k : int
The maximum sequence length for the key and value tensors in the batch.
sm_scale : Optional[float]
The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
smooth_k : bool
Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
Default: True.
Returns
-------
torch.Tensor
The output tensor, shape: ``[cu_seqlens_q[-1], num_qo_heads, head_dim]``.
Note
----
- ``num_qo_heads`` must be divisible by ``num_kv_heads``.
- The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``.
- There is no ``is_causal`` flag: the wrapped Triton kernel always applies a causal mask within each sequence (assuming ``qo_len == kv_len`` per sequence).
- The tensors `cu_seqlens_q` and `cu_seqlens_k` must have the dtype ``torch.int32`` or ``torch.int64``.
- All tensors must be on the same cuda device.
- `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
"""
dtype = q.dtype
assert q.is_cuda, "Input tensors must be on cuda."
assert dtype in [
torch.float16,
torch.bfloat16,
], "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
assert q.device == k.device == v.device, "All tensors must be on the same device."
assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
head_dim = q.size(-1)
assert head_dim in [64, 128], "varlen only supports head_dim in [64, 128]."
assert (
q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1
), "Last dim of qkv must be contiguous."
assert (
cu_seqlens_q.is_contiguous() and cu_seqlens_k.is_contiguous()
), "cu_seqlens_q and cu_seqlens_k must be contiguous."
if dtype == torch.bfloat16 or dtype == torch.float32:
v = v.to(torch.float16)
if smooth_k:
km = k.mean(
dim=0, keepdim=True
        )  # km is computed over all packed sequences at once; computing it per sequence would require a dedicated kernel.
k -= km
(
q_int8,
q_scale,
k_int8,
k_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
) = per_block_int8_varlen_triton(
q, k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, sm_scale=sm_scale
)
o = attn_true_varlen(
q_int8,
k_int8,
v,
cu_seqlens_q,
cu_seqlens_k,
max_seqlen_q,
q_scale,
k_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
output_dtype=dtype,
)
return o
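# Illustrative usage sketch (not part of this commit's API surface; shapes follow the
# docstring above). Two packed sequences of lengths 3 and 5 share one token buffer and
# cu_seqlens marks their boundaries:
#
#   q = torch.randn(8, 8, 64, dtype=torch.float16, device="cuda")
#   k = torch.randn(8, 8, 64, dtype=torch.float16, device="cuda")
#   v = torch.randn(8, 8, 64, dtype=torch.float16, device="cuda")
#   cu_seqlens = torch.tensor([0, 3, 8], dtype=torch.int32, device="cuda")
#   out = sageattn_varlen(q, k, v, cu_seqlens, cu_seqlens, max_seqlen_q=5, max_seqlen_k=5)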
class SageAttentionFunction(Function):
@staticmethod
def forward(
ctx,
query,
key,
value,
attn_mask=None,
dropout_p=0.0,
is_causal=False,
scale=None,
):
"""
query: Tensor of shape [batch_size, num_heads, seq_len_q, head_dim]
key: Tensor of shape [batch_size, num_heads, seq_len_k, head_dim]
value: Tensor of shape [batch_size, num_heads, seq_len_k, head_dim]
attn_mask: Optional[Tensor], mask tensor
dropout_p: float, dropout probability
is_causal: bool, whether to apply causal masking
scale: Optional[float], scaling factor for attention scores
"""
# Ensure inputs are contiguous
query = query.contiguous()
key = key.contiguous()
value = value.contiguous()
# Handle default scale
if scale is None:
scale = 1.0 / (query.size(-1) ** 0.5)
# Save parameters needed for backward
ctx.scale = scale
ctx.is_causal = is_causal
ctx.dropout_p = dropout_p
ctx.attn_mask = attn_mask
        # Prepare cumulative sequence lengths and max sequence lengths.
        # Assumes the batch and sequence dimensions are consistent across query, key, and value.
batch_size, num_heads, seq_len_q, head_dim = query.shape
seq_len_k = key.shape[2]
# Flatten batch and head dimensions
q = query.view(
-1, seq_len_q, head_dim
) # [batch_size * num_heads, seq_len_q, head_dim]
k = key.view(-1, seq_len_k, head_dim)
v = value.view(-1, seq_len_k, head_dim)
# Create cumulative sequence lengths
cu_seqlens_q = torch.arange(
0,
(batch_size * num_heads + 1) * seq_len_q,
seq_len_q,
dtype=torch.int32,
device=query.device,
)
cu_seqlens_k = torch.arange(
0,
(batch_size * num_heads + 1) * seq_len_k,
seq_len_k,
dtype=torch.int32,
device=key.device,
)
max_seqlen_q = seq_len_q
max_seqlen_k = seq_len_k
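        # Example of the layout this produces (illustrative numbers): with batch_size=2,
        # num_heads=4 and seq_len_q=16, cu_seqlens_q is [0, 16, 32, ..., 128] -- one
        # fixed-length "sequence" per (batch, head) pair after the view() above.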
        # Per-block int8 quantization of q and k (also returns per-block scales and their cu_seqlens)
(
q_int8,
q_scale,
k_int8,
k_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
) = per_block_int8_varlen_triton(
q, k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, sm_scale=scale
)
        # Run the quantized varlen attention kernel (causal path only)
if is_causal:
output = attn_true_varlen(
q_int8,
k_int8,
v,
cu_seqlens_q,
cu_seqlens_k,
max_seqlen_q,
q_scale,
k_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
output_dtype=query.dtype,
)
else:
raise NotImplementedError("Non-causal attention is not implemented yet.")
# Reshape output to match the expected shape
output = output.view(batch_size, num_heads, seq_len_q, head_dim)
# Save tensors for backward
ctx.save_for_backward(
query,
key,
value,
q_int8,
k_int8,
q_scale,
k_scale,
cu_seqlens_q,
cu_seqlens_k,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
output,
)
return output
@staticmethod
def backward(ctx, grad_output):
(
query,
key,
value,
q_int8,
k_int8,
q_scale,
k_scale,
cu_seqlens_q,
cu_seqlens_k,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
output,
) = ctx.saved_tensors
scale = ctx.scale
is_causal = ctx.is_causal
dropout_p = ctx.dropout_p
attn_mask = ctx.attn_mask
# Flatten batch and head dimensions
batch_size, num_heads, seq_len_q, head_dim = query.shape
seq_len_k = key.shape[2]
grad_output = grad_output.contiguous()
do = grad_output.view(-1, seq_len_q, head_dim)
# Compute gradients w.r.t. q, k, v
dq, dk, dv = sageattn_varlen_backward(
do,
query.view(-1, seq_len_q, head_dim),
key.view(-1, seq_len_k, head_dim),
value.view(-1, seq_len_k, head_dim),
cu_seqlens_q,
cu_seqlens_k,
seq_len_q,
seq_len_k,
q_int8,
k_int8,
q_scale,
k_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
scale,
is_causal,
)
# Reshape gradients to match the input shapes
dq = dq.view(batch_size, num_heads, seq_len_q, head_dim)
dk = dk.view(batch_size, num_heads, seq_len_k, head_dim)
dv = dv.view(batch_size, num_heads, seq_len_k, head_dim)
# Handle optional arguments
d_attn_mask = None # Assuming attn_mask does not require gradients
        d_dropout_p = None  # dropout probability is a hyperparameter, typically not optimized
d_is_causal = None # Not differentiable
        d_scale = None  # scale is a plain float here, so no gradient is returned
return dq, dk, dv, d_attn_mask, d_dropout_p, d_is_causal, d_scale
def scaled_dot_product_attention(
query,
key,
value,
attn_mask=None,
dropout_p=0.0,
is_causal=False,
scale=None,
):
"""
Custom scaled dot product attention using SageAttentionFunction.
"""
return SageAttentionFunction.apply(
query, key, value, attn_mask, dropout_p, is_causal, scale
)
def monkeypatch_sdp_w_sage_attention():
"""
Replace torch.nn.functional.scaled_dot_product_attention with custom scaled dot product attention using SageAttentionFunction.
"""
torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention
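# Illustrative usage sketch (hedged; the call site mirrors how axolotl wires this up in
# utils/models.py). Apply the patch once before the model is built so that every module
# calling F.scaled_dot_product_attention picks up the Sage-quantized kernel:
#
#   from axolotl.integrations.sageattention.lib.core import monkeypatch_sdp_w_sage_attention
#   monkeypatch_sdp_w_sage_attention()
#   out = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)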

View File

@@ -0,0 +1,622 @@
"""
Copyright (c) 2024 by SageAttention team.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import math
import torch
import triton
import triton.language as tl
@triton.jit
def _attn_fwd_inner(
acc,
l_i,
m_i,
q,
q_scale,
kv_len,
K_ptrs,
K_scale_ptr,
V_ptrs,
stride_kn,
stride_vn,
start_m,
H: tl.constexpr,
BLOCK_M: tl.constexpr,
HEAD_DIM: tl.constexpr,
BLOCK_N: tl.constexpr,
STAGE: tl.constexpr,
offs_m: tl.constexpr,
offs_n: tl.constexpr,
):
if STAGE == 1:
lo, hi = 0, start_m * BLOCK_M
elif STAGE == 2:
lo, hi = start_m * BLOCK_M, (start_m + 1) * BLOCK_M
lo = tl.multiple_of(lo, BLOCK_M)
K_scale_ptr += (lo // BLOCK_N) * H
K_ptrs += stride_kn * lo
V_ptrs += stride_vn * lo
for start_n in range(lo, hi, BLOCK_N):
start_n = tl.multiple_of(start_n, BLOCK_N)
k_mask = offs_n[None, :] < (kv_len - start_n)
k = tl.load(K_ptrs, mask=k_mask)
k_scale = tl.load(K_scale_ptr)
qk = tl.dot(q, k).to(tl.float32) * q_scale * k_scale
if STAGE == 2:
mask = offs_m[:, None] >= (start_n + offs_n[None, :])
qk = qk + tl.where(mask, 0, -1.0e6)
m_ij = tl.maximum(m_i, tl.max(qk, 1))
qk -= m_ij[:, None]
else:
m_ij = tl.maximum(m_i, tl.max(qk, 1))
qk = qk - m_ij[:, None]
p = tl.math.exp2(qk)
l_ij = tl.sum(p, 1)
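        # Online softmax: alpha = exp2(m_old - m_new) below rescales the running row sum
        # and output accumulator whenever this key block raises the running max m_i.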
alpha = tl.math.exp2(m_i - m_ij)
l_i = l_i * alpha + l_ij
acc = acc * alpha[:, None]
v = tl.load(V_ptrs, mask=offs_n[:, None] < (kv_len - start_n))
p = p.to(tl.float16)
acc += tl.dot(p, v, out_dtype=tl.float16)
m_i = m_ij
K_ptrs += BLOCK_N * stride_kn
K_scale_ptr += H
V_ptrs += BLOCK_N * stride_vn
return acc, l_i, m_i
@triton.jit
def _attn_fwd(
Q,
K,
V,
cu_seqlens_q,
cu_seqlens_k,
Q_scale,
K_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
Out,
stride_qh,
stride_qn,
stride_kh,
stride_kn,
stride_vh,
stride_vn,
stride_oh,
stride_on,
H: tl.constexpr,
num_kv_groups: tl.constexpr,
HEAD_DIM: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
STAGE: tl.constexpr,
):
start_m = tl.program_id(0)
off_z = tl.program_id(2).to(tl.int64)
off_h = tl.program_id(1).to(tl.int64)
cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z)
cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1)
qo_len = cu_seqlens_q_end - cu_seqlens_q_start
if (start_m * BLOCK_M) >= qo_len:
return
cu_seq_lens_q_scale_start = tl.load(cu_seqlens_q_scale + off_z)
cu_seq_lens_k_scale_start = tl.load(cu_seqlens_k_scale + off_z)
q_scale_offset = cu_seq_lens_q_scale_start * H + off_h + start_m * H
k_scale_offset = (
cu_seq_lens_k_scale_start * (H // num_kv_groups) + off_h // num_kv_groups
)
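    # Per-block scales are stored as a flattened [total_blocks, heads] array, so the scale
    # for (sequence, head, query block start_m) lives at seq_scale_start * H + start_m * H + off_h.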
cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z)
cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1)
kv_len = cu_seqlens_k_end - cu_seqlens_k_start
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_n = tl.arange(0, BLOCK_N)
offs_k = tl.arange(0, HEAD_DIM)
Q_ptrs = (
Q
+ (cu_seqlens_q_start * stride_qn + off_h * stride_qh)
+ offs_m[:, None] * stride_qn
+ offs_k[None, :]
)
Q_scale_ptr = Q_scale + q_scale_offset
K_ptrs = (
K
+ (cu_seqlens_k_start * stride_kn + (off_h // num_kv_groups) * stride_kh)
+ offs_n[None, :] * stride_kn
+ offs_k[:, None]
)
K_scale_ptr = K_scale + k_scale_offset
V_ptrs = (
V
+ (cu_seqlens_k_start * stride_vn + (off_h // num_kv_groups) * stride_vh)
+ offs_n[:, None] * stride_vn
+ offs_k[None, :]
)
O_block_ptr = (
Out
+ (cu_seqlens_q_start * stride_on + off_h * stride_oh)
+ offs_m[:, None] * stride_on
+ offs_k[None, :]
)
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0
acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32)
q = tl.load(Q_ptrs, mask=offs_m[:, None] < qo_len)
q_scale = tl.load(Q_scale_ptr)
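    # Causal split (assuming STAGE == 3 from the launcher below): the first inner call runs
    # with stage 4 - STAGE = 1 over key blocks that are fully visible (no element mask), the
    # second with stage 2 over the single block that straddles the causal diagonal.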
acc, l_i, m_i = _attn_fwd_inner(
acc,
l_i,
m_i,
q,
q_scale,
kv_len,
K_ptrs,
K_scale_ptr,
V_ptrs,
stride_kn,
stride_vn,
start_m,
H // num_kv_groups,
BLOCK_M,
HEAD_DIM,
BLOCK_N,
4 - STAGE,
offs_m,
offs_n,
)
acc, l_i, _ = _attn_fwd_inner(
acc,
l_i,
m_i,
q,
q_scale,
kv_len,
K_ptrs,
K_scale_ptr,
V_ptrs,
stride_kn,
stride_vn,
start_m,
H // num_kv_groups,
BLOCK_M,
HEAD_DIM,
BLOCK_N,
2,
offs_m,
offs_n,
)
acc = acc / l_i[:, None]
tl.store(O_block_ptr, acc.to(Out.type.element_ty), mask=(offs_m[:, None] < qo_len))
@triton.jit
def _attn_bwd_inner(
dq_acc,
dk_acc,
dv_acc,
l_i,
m_i,
q,
k,
v,
do,
q_scale,
k_scale,
kv_len,
stride_kn,
stride_vn,
start_m,
H,
BLOCK_M: tl.constexpr,
HEAD_DIM: tl.constexpr,
BLOCK_N: tl.constexpr,
STAGE: tl.constexpr,
offs_m: tl.constexpr,
offs_n: tl.constexpr,
):
if STAGE == 1:
lo, hi = 0, start_m * BLOCK_M
elif STAGE == 2:
lo, hi = start_m * BLOCK_M, (start_m + 1) * BLOCK_M
lo = tl.multiple_of(lo, BLOCK_M)
k += stride_kn * lo
v += stride_vn * lo
for start_n in range(lo, hi, BLOCK_N):
start_n = tl.multiple_of(start_n, BLOCK_N)
        k_mask = offs_n[None, :] < (kv_len - start_n)
        v_mask = offs_n[:, None] < (kv_len - start_n)
        k_curr = tl.load(k, mask=k_mask)
        v_curr = tl.load(v, mask=v_mask)
        k_scale_curr = tl.load(k_scale)
        s = tl.dot(q, k_curr).to(tl.float32) * q_scale * k_scale_curr
if STAGE == 2:
mask = offs_m[:, None] >= (start_n + offs_n[None, :])
s = s + tl.where(mask, 0.0, -float("inf"))
m_ij = tl.maximum(m_i, tl.max(s, 1))
s = s - m_ij[:, None]
else:
m_ij = tl.maximum(m_i, tl.max(s, 1))
s = s - m_ij[:, None]
p = tl.math.exp2(s)
l_ij = tl.sum(p, 1)
alpha = tl.math.exp2(m_i - m_ij)
l_i = l_i * alpha + l_ij
m_i = m_ij
p = p / l_i[:, None] # Normalize probabilities
# Compute gradients
# Compute softmax gradient
do_scaled = do / l_i[:, None]
        dv_contrib = tl.dot(tl.trans(p.to(tl.float16)), do_scaled.to(tl.float16))
        dv_acc += dv_contrib
        dp = tl.dot(do_scaled.to(tl.float16), tl.trans(v_curr.to(tl.float16)))
# Compute ds (gradient w.r.t. logits s)
p_dp = p * dp
sum_p_dp = tl.sum(p_dp, axis=1)
ds = (p_dp - p * sum_p_dp[:, None]) * tl.math.log(2.0) # Adjust for exp2
# Compute gradients w.r.t q and k
        dq_contrib = tl.dot(ds.to(tl.float16), tl.trans(k_curr.to(tl.float16)))
        dk_contrib = tl.dot(tl.trans(ds.to(tl.float16)), q.to(tl.float16))
dq_acc += dq_contrib * (q_scale * k_scale_curr)
dk_acc += dk_contrib * (q_scale * k_scale_curr)
k += BLOCK_N * stride_kn
k_scale += H
v += BLOCK_N * stride_vn
return dq_acc, dk_acc, dv_acc, l_i, m_i
@triton.jit
def _attn_bwd(
DO,
Q,
K,
V,
cu_seqlens_q,
cu_seqlens_k,
Q_scale,
K_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
L,
M,
DQ,
DK,
DV,
stride_qh,
stride_qn,
stride_kh,
stride_kn,
stride_vh,
stride_vn,
H: tl.constexpr,
num_kv_groups: tl.constexpr,
HEAD_DIM: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
STAGE: tl.constexpr,
):
start_m = tl.program_id(0)
off_z = tl.program_id(2).to(tl.int64)
off_h = tl.program_id(1).to(tl.int64)
cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z)
cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1)
qo_len = cu_seqlens_q_end - cu_seqlens_q_start
if (start_m * BLOCK_M) >= qo_len:
return
cu_seq_lens_q_scale_start = tl.load(cu_seqlens_q_scale + off_z)
cu_seq_lens_k_scale_start = tl.load(cu_seqlens_k_scale + off_z)
q_scale_offset = cu_seq_lens_q_scale_start * H + off_h + start_m * H
k_scale_offset = (
cu_seq_lens_k_scale_start * (H // num_kv_groups) + off_h // num_kv_groups
)
cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z)
cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1)
kv_len = cu_seqlens_k_end - cu_seqlens_k_start
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_n = tl.arange(0, BLOCK_N)
offs_k = tl.arange(0, HEAD_DIM)
Q_ptrs = (
Q
+ (cu_seqlens_q_start * stride_qn + off_h * stride_qh)
+ offs_m[:, None] * stride_qn
+ offs_k[None, :]
)
DO_ptrs = (
DO
+ (cu_seqlens_q_start * stride_qn + off_h * stride_qh)
+ offs_m[:, None] * stride_qn
+ offs_k[None, :]
)
Q_scale_ptr = Q_scale + q_scale_offset
K_ptrs = (
K
+ (cu_seqlens_k_start * stride_kn + (off_h // num_kv_groups) * stride_kh)
+ offs_n[None, :] * stride_kn
+ offs_k[:, None]
)
K_scale_ptr = K_scale + k_scale_offset
V_ptrs = (
V
+ (cu_seqlens_k_start * stride_vn + (off_h // num_kv_groups) * stride_vh)
+ offs_n[:, None] * stride_vn
+ offs_k[None, :]
)
DQ_ptrs = (
DQ
+ (cu_seqlens_q_start * stride_qn + off_h * stride_qh)
+ offs_m[:, None] * stride_qn
+ offs_k[None, :]
)
DK_ptrs = (
DK
+ (cu_seqlens_k_start * stride_kn + (off_h // num_kv_groups) * stride_kh)
+ offs_n[None, :] * stride_kn
+ offs_k[:, None]
)
DV_ptrs = (
DV
+ (cu_seqlens_k_start * stride_vn + (off_h // num_kv_groups) * stride_vh)
+ offs_n[:, None] * stride_vn
+ offs_k[None, :]
)
L_ptrs = L + (cu_seqlens_q_start + offs_m)
M_ptrs = M + (cu_seqlens_q_start + offs_m)
m_i = tl.load(M_ptrs, mask=offs_m < qo_len, other=float("-inf"))
l_i = tl.load(L_ptrs, mask=offs_m < qo_len, other=1.0)
dq_acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32)
dk_acc = tl.zeros([BLOCK_N, HEAD_DIM], dtype=tl.float32)
dv_acc = tl.zeros([BLOCK_N, HEAD_DIM], dtype=tl.float32)
q = tl.load(Q_ptrs, mask=offs_m[:, None] < qo_len)
do = tl.load(DO_ptrs, mask=offs_m[:, None] < qo_len)
q_scale = tl.load(Q_scale_ptr)
dq_acc, dk_acc, dv_acc, l_i, m_i = _attn_bwd_inner(
dq_acc,
dk_acc,
dv_acc,
l_i,
m_i,
q,
K_ptrs,
V_ptrs,
do,
q_scale,
K_scale_ptr,
kv_len,
stride_kn,
stride_vn,
start_m,
H // num_kv_groups,
BLOCK_M,
HEAD_DIM,
BLOCK_N,
4 - STAGE,
offs_m,
offs_n,
)
dq_acc, dk_acc, dv_acc, l_i, m_i = _attn_bwd_inner(
dq_acc,
dk_acc,
dv_acc,
l_i,
m_i,
q,
K_ptrs,
V_ptrs,
do,
q_scale,
K_scale_ptr,
kv_len,
stride_kn,
stride_vn,
start_m,
H // num_kv_groups,
BLOCK_M,
HEAD_DIM,
BLOCK_N,
2,
offs_m,
offs_n,
)
tl.store(DQ_ptrs, dq_acc.to(DQ.dtype.element_ty), mask=offs_m[:, None] < qo_len)
tl.store(DK_ptrs, dk_acc.to(DK.dtype.element_ty), mask=offs_n[None, :] < kv_len)
tl.store(DV_ptrs, dv_acc.to(DV.dtype.element_ty), mask=offs_n[:, None] < kv_len)
def forward(
q,
k,
v,
cu_seqlens_q,
cu_seqlens_k,
max_seqlen_q,
q_scale,
k_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
output_dtype=torch.float16,
):
BLOCK_M = 128
BLOCK_N = 64
stage = 3
o = torch.empty(q.shape, dtype=output_dtype, device=q.device)
b = cu_seqlens_q.shape[0] - 1
_, h_qo, head_dim = q.shape
_, h_kv, _ = k.shape
HEAD_DIM_K = head_dim
num_kv_groups = h_qo // h_kv
grid = (triton.cdiv(max_seqlen_q, BLOCK_M), h_qo, b)
_attn_fwd[grid](
q,
k,
v,
cu_seqlens_q,
cu_seqlens_k,
q_scale,
k_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
o,
q.stride(1),
q.stride(0),
k.stride(1),
k.stride(0),
v.stride(1),
v.stride(0),
o.stride(1),
o.stride(0),
h_qo,
num_kv_groups,
BLOCK_M=BLOCK_M,
BLOCK_N=BLOCK_N,
HEAD_DIM=HEAD_DIM_K,
STAGE=stage,
num_warps=4 if head_dim == 64 else 8,
num_stages=4,
)
return o
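# Illustrative wiring sketch (hedged; this mirrors what lib/core.py does rather than adding
# a new entry point): quantize q/k per block, then hand the int8 tensors and their scales to
# forward().
#
#   from .quant_per_block_varlen import per_block_int8
#   q_i8, q_s, k_i8, k_s, cu_qs, cu_ks = per_block_int8(q, k, cu_q, cu_k, max_q, max_k)
#   o = forward(q_i8, k_i8, v, cu_q, cu_k, max_q, q_s, k_s, cu_qs, cu_ks, output_dtype=torch.float16)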
def backward(
do,
q,
k,
v,
cu_seqlens_q,
cu_seqlens_k,
max_seqlen_q,
q_scale,
k_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
l,
m,
output_dtype=torch.float16,
):
BLOCK_M = 128
BLOCK_N = 64
stage = 3
device = q.device
dtype = q.dtype
b = cu_seqlens_q.shape[0] - 1
_, h_qo, head_dim = q.shape
_, h_kv, _ = k.shape
num_kv_groups = h_qo // h_kv
dq = torch.zeros_like(q, dtype=output_dtype)
dk = torch.zeros_like(k, dtype=output_dtype)
dv = torch.zeros_like(v, dtype=output_dtype)
grid = (triton.cdiv(max_seqlen_q, BLOCK_M), h_qo, b)
_attn_bwd[grid](
do,
q,
k,
v,
cu_seqlens_q,
cu_seqlens_k,
q_scale,
k_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
l,
m,
dq,
dk,
dv,
q.stride(1),
q.stride(0),
k.stride(1),
k.stride(0),
v.stride(1),
v.stride(0),
h_qo,
num_kv_groups,
HEAD_DIM=head_dim,
BLOCK_M=BLOCK_M,
BLOCK_N=BLOCK_N,
STAGE=stage,
num_warps=4 if head_dim == 64 else 8,
num_stages=4,
)
return dq, dk, dv
# class TritonAttentionFunction(torch.autograd.Function):
# @staticmethod
# def forward(ctx, q, k, v, cu_seqlens_q, cu_seqlens_k, q_scale, k_scale, cu_seqlens_q_scale, cu_seqlens_k_scale):
# l = torch.zeros(q.shape[0], device=q.device, dtype=torch.float32)
# m = torch.zeros(q.shape[0], device=q.device, dtype=torch.float32)
# output = forward(q, k, v, cu_seqlens_q, cu_seqlens_k, q.shape[0], q_scale, k_scale, cu_seqlens_q_scale, cu_seqlens_k_scale, l, m)
# ctx.save_for_backward(q, k, v, cu_seqlens_q, cu_seqlens_k, q_scale, k_scale, cu_seqlens_q_scale, cu_seqlens_k_scale, l, m)
# return output
#
# @staticmethod
# def backward(ctx, do):
# q, k, v, cu_seqlens_q, cu_seqlens_k, q_scale, k_scale, cu_seqlens_q_scale, cu_seqlens_k_scale, l, m = ctx.saved_tensors
# dq, dk, dv = backward(
# do, q, k, v,
# cu_seqlens_q, cu_seqlens_k,
# q.shape[0], q_scale, k_scale,
# cu_seqlens_q_scale, cu_seqlens_k_scale,
# l, m,
# )
# return dq, dk, dv, None, None, None, None, None, None

View File

@@ -0,0 +1,158 @@
"""
Copyright (c) 2024 by SageAttention team.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import torch
import triton
import triton.language as tl
@triton.jit
def quant_per_block_int8_kernel(
Input,
Output,
Scale,
cu_seqlens_input,
cu_seqlens_scale,
stride_ih,
stride_in,
stride_oh,
stride_on,
sm_scale,
H: tl.constexpr,
C: tl.constexpr,
BLK: tl.constexpr,
):
off_blk = tl.program_id(0)
off_h = tl.program_id(1)
off_b = tl.program_id(2)
cu_seqlens_input_start = tl.load(cu_seqlens_input + off_b)
cu_seqlens_input_end = tl.load(cu_seqlens_input + off_b + 1)
L = cu_seqlens_input_end - cu_seqlens_input_start
if (off_blk * BLK) >= L:
return
cu_seqlens_scale_start = tl.load(cu_seqlens_scale + off_b)
offs_n = off_blk * BLK + tl.arange(0, BLK)
offs_k = tl.arange(0, C)
input_ptrs = (
Input
+ cu_seqlens_input_start * stride_in
+ off_h * stride_ih
+ offs_n[:, None] * stride_in
+ offs_k[None, :]
)
output_ptrs = (
Output
+ cu_seqlens_input_start * stride_on
+ off_h * stride_oh
+ offs_n[:, None] * stride_on
+ offs_k[None, :]
)
scale_ptrs = Scale + cu_seqlens_scale_start * H + off_h + off_blk * H
x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
x = x.to(tl.float32)
x *= sm_scale
scale = tl.max(tl.abs(x)) / 127.0
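    # Symmetric per-block quantization: one fp32 scale per (block, head). The +/- 0.5
    # added below implements round-to-nearest before the truncating cast to int8.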
x_int8 = x / scale
x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
x_int8 = x_int8.to(tl.int8)
tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
tl.store(scale_ptrs, scale)
def per_block_int8(
q,
k,
cu_seqlens_q,
cu_seqlens_k,
max_seqlen_q,
max_seqlen_k,
BLKQ=128,
BLKK=64,
sm_scale=None,
):
q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
h_qo = q.shape[1]
h_kv = k.shape[1]
head_dim = q.shape[-1]
b = cu_seqlens_q.shape[0] - 1
q_batch_len = cu_seqlens_q[1:] - cu_seqlens_q[:-1]
k_batch_len = cu_seqlens_k[1:] - cu_seqlens_k[:-1]
q_scale_len = (q_batch_len + BLKQ - 1) // BLKQ
k_scale_len = (k_batch_len + BLKK - 1) // BLKK
cu_seqlens_q_scale = torch.nn.functional.pad(
torch.cumsum(q_scale_len, dim=0), (1, 0), value=0
)
cu_seqlens_k_scale = torch.nn.functional.pad(
torch.cumsum(k_scale_len, dim=0), (1, 0), value=0
)
q_scale = torch.empty(
(cu_seqlens_q_scale[-1], h_qo), device=q.device, dtype=torch.float32
)
k_scale = torch.empty(
(cu_seqlens_k_scale[-1], h_kv), device=k.device, dtype=torch.float32
)
if sm_scale is None:
sm_scale = head_dim**-0.5
grid = ((max_seqlen_q + BLKQ - 1) // BLKQ, h_qo, b)
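    # 1.44269504 ~= log2(e): the softmax scale is folded into q here so the attention
    # kernel can use exp2() instead of exp(); k is quantized with sm_scale=1.0 below.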
quant_per_block_int8_kernel[grid](
q,
q_int8,
q_scale,
cu_seqlens_q,
cu_seqlens_q_scale,
q.stride(1),
q.stride(0),
q_int8.stride(1),
q_int8.stride(0),
sm_scale=(sm_scale * 1.44269504),
H=h_qo,
C=head_dim,
BLK=BLKQ,
)
grid = ((max_seqlen_k + BLKK - 1) // BLKK, h_kv, b)
quant_per_block_int8_kernel[grid](
k,
k_int8,
k_scale,
cu_seqlens_k,
cu_seqlens_k_scale,
k.stride(1),
k.stride(0),
k_int8.stride(1),
k_int8.stride(0),
sm_scale=1.0,
H=h_kv,
C=head_dim,
BLK=BLKK,
)
return q_int8, q_scale, k_int8, k_scale, cu_seqlens_q_scale, cu_seqlens_k_scale
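# Shape sketch (illustrative numbers): for two packed query sequences of lengths 300 and 100
# with BLKQ=128, q_scale_len is [3, 1], cu_seqlens_q_scale is [0, 3, 4], and q_scale has
# shape [4, h_qo] -- one fp32 scale per (128-token block, head).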

View File

@@ -46,6 +46,7 @@ from transformers.integrations.deepspeed import (
)
from axolotl.common.architectures import MOE_ARCH_BLOCK
from axolotl.integrations.sageattention.lib.core import monkeypatch_sdp_w_sage_attention
from axolotl.models.mamba import fix_mamba_attn_for_loss
from axolotl.monkeypatch.multipack import (
SUPPORTED_MULTIPACK_MODEL_TYPES,
@@ -707,6 +708,7 @@ class ModelLoader:
self.model_config._attn_implementation = ( # pylint: disable=protected-access
"sdpa"
)
monkeypatch_sdp_w_sage_attention()
elif self.cfg.eager_attention:
self.model_kwargs["attn_implementation"] = "eager"
self.model_config._attn_implementation = ( # pylint: disable=protected-access