[gemma4] use mixed Flash Attention and SDPA and add fused RMSNorm+RoPE Triton kernels (#3598)
This commit is contained in:
529
src/axolotl/kernels/gemma4_fused_rope.py
Normal file
529
src/axolotl/kernels/gemma4_fused_rope.py
Normal file
@@ -0,0 +1,529 @@
|
|||||||
|
"""
|
||||||
|
Fused RMSNorm + RoPE Triton kernel for Gemma 4.
|
||||||
|
|
||||||
|
Fuses three operations into one kernel launch:
|
||||||
|
1. RMSNorm: x_norm = (x / sqrt(mean(x^2) + eps)) * weight
|
||||||
|
2. RoPE: y = x_norm * cos + rotate_half(x_norm) * sin
|
||||||
|
3. (optional) RMSNorm without scale (for v_norm)
|
||||||
|
|
||||||
|
This eliminates two intermediate tensor materializations per Q/K path and the
allocation churn from rotate_half / apply_rotary_pos_emb.
|
||||||
|
|
||||||
|
Shapes:
|
||||||
|
X: (rows, head_dim) — flattened from (batch, seq_len, num_heads, head_dim)
|
||||||
|
W: (head_dim,) — RMSNorm weight (None for with_scale=False)
|
||||||
|
cos: (rows, head_dim) — flattened from (batch, seq_len, 1, head_dim) after broadcast
|
||||||
|
sin: (rows, head_dim) — same as cos
|
||||||
|
"""
|
||||||
|
|
||||||
|
import math
|
||||||
|
import operator
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import triton
|
||||||
|
import triton.language as tl
|
||||||
|
from liger_kernel.ops.utils import (
|
||||||
|
calculate_settings,
|
||||||
|
compare_version,
|
||||||
|
ensure_contiguous,
|
||||||
|
torch_to_triton_dtype,
|
||||||
|
)
|
||||||
|
from liger_kernel.utils import is_npu_available
|
||||||
|
|
||||||
|
if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
|
||||||
|
try:
|
||||||
|
from triton.language.extra.libdevice import rsqrt
|
||||||
|
except ModuleNotFoundError:
|
||||||
|
from triton.language.extra.cuda.libdevice import rsqrt
|
||||||
|
else:
|
||||||
|
from triton.language.math import rsqrt
|
||||||
|
|
||||||
|
|
||||||
|
@triton.jit
def _rms_norm_rope_forward_kernel(
    Y_ptr,
    Y_row_stride,
    X_ptr,
    X_row_stride,
    W_ptr,
    COS_ptr,
    COS_row_stride,
    SIN_ptr,
    SIN_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    n_cols,
    n_heads,
    eps,
    HAS_WEIGHT: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
):
    """
    Fused forward:
        x_norm = x / rms(x) [* weight]                  (RMSNorm)
        y = x_norm * cos + rotate_half(x_norm) * sin    (RoPE)

    rotate_half swaps first/second halves and negates the first:
        rotate_half([a, b]) = [-b, a]

    cos/sin are indexed by row_idx // n_heads to handle per-head broadcast
    (cos/sin have shape (B*S, D) while X has shape (B*S*H, D)).

    One program instance handles one X row; the per-row 1/rms is also stored
    to RSTD so the backward pass does not have to recompute it.
    """
    # int64 row index avoids overflow for large B*S*H row counts.
    row_idx = tl.program_id(0).to(tl.int64)
    # cos/sin row: divide by n_heads since cos/sin are (B*S, D)
    cs_row_idx = row_idx // n_heads
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols
    half_dim = n_cols // 2

    # Load input row
    X_row = tl.load(X_ptr + row_idx * X_row_stride + col_offsets, mask=mask, other=0)
    X_dtype = X_row.dtype
    # All math happens in fp32; the result is cast back to the input dtype.
    X_fp32 = X_row.to(tl.float32)

    # RMSNorm: compute 1/rms and cache it for backward
    mean_sq = tl.sum(X_fp32 * X_fp32, axis=0) / n_cols
    rstd = rsqrt(mean_sq + eps)
    tl.store(RSTD_ptr + row_idx * RSTD_row_stride, rstd)

    # Normalize
    X_norm = X_fp32 * rstd

    # Apply weight if present (with_scale=True)
    if HAS_WEIGHT:
        W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0).to(tl.float32)
        X_norm = X_norm * W_row

    # RoPE: load cos/sin (broadcast across heads)
    cos_row = tl.load(
        COS_ptr + cs_row_idx * COS_row_stride + col_offsets, mask=mask, other=0
    ).to(tl.float32)
    sin_row = tl.load(
        SIN_ptr + cs_row_idx * SIN_row_stride + col_offsets, mask=mask, other=0
    ).to(tl.float32)

    # rotate_half: for col < half_dim, take -X_norm[col + half_dim]
    #              for col >= half_dim, take X_norm[col - half_dim]
    rot_offsets = tl.where(
        col_offsets < half_dim, col_offsets + half_dim, col_offsets - half_dim
    )
    rot_mask = rot_offsets < n_cols
    X_rot = tl.load(
        X_ptr + row_idx * X_row_stride + rot_offsets, mask=rot_mask & mask, other=0
    ).to(tl.float32)
    # Re-normalize the rotated values (same per-row rstd applies)
    X_rot_norm = X_rot * rstd
    if HAS_WEIGHT:
        # Weight is per-column, so the rotated elements need the weight at
        # their rotated positions.
        W_rot = tl.load(W_ptr + rot_offsets, mask=rot_mask & mask, other=0).to(
            tl.float32
        )
        X_rot_norm = X_rot_norm * W_rot

    # Negate the first half (rotate_half negates x2, which becomes the first half)
    sign = tl.where(col_offsets < half_dim, -1.0, 1.0)
    X_rot_norm = X_rot_norm * sign

    # Final RoPE: y = x_norm * cos + rotate_half(x_norm) * sin
    Y_row = X_norm * cos_row + X_rot_norm * sin_row

    tl.store(
        Y_ptr + row_idx * Y_row_stride + col_offsets,
        Y_row.to(X_dtype),
        mask=mask,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@triton.jit
def _rms_norm_rope_backward_kernel(
    dY_ptr,
    dY_row_stride,
    dX_ptr,
    dX_row_stride,
    X_ptr,
    X_row_stride,
    X_dtype: tl.constexpr,
    W_ptr,
    COS_ptr,
    COS_row_stride,
    SIN_ptr,
    SIN_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    dW_ptr,
    dW_row_stride,
    n_rows,
    n_cols,
    n_heads,
    rows_per_program,
    HAS_WEIGHT: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
):
    """
    Backward for Y = RoPE(RMSNorm(X, W)).

    cos/sin indexed by row_idx // n_heads for per-head broadcast.

    Each program processes a contiguous slice of rows ([row_start, row_end))
    so that the per-column weight gradient can be accumulated locally in
    dW_acc and written out once per program; the host sums the per-program
    partials afterwards.
    """
    row_block_id = tl.program_id(0).to(tl.int64)
    row_start = row_block_id * rows_per_program
    # Clamp the last program's slice to the actual row count.
    row_end = min((row_block_id + 1) * rows_per_program, n_rows)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols
    half_dim = n_cols // 2

    # Per-program partial weight gradient (only stored when HAS_WEIGHT).
    dW_acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)

    if HAS_WEIGHT:
        # Weight is row-invariant; load it once outside the row loop.
        W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0).to(tl.float32)

    for row_idx in range(row_start, row_end):
        cs_row_idx = row_idx // n_heads

        dY_row = tl.load(
            dY_ptr + row_idx * dY_row_stride + col_offsets, mask=mask, other=0
        ).to(tl.float32)
        X_row = tl.load(
            X_ptr + row_idx * X_row_stride + col_offsets, mask=mask, other=0
        ).to(tl.float32)
        # 1/rms cached by the forward kernel — no recompute needed.
        rstd = tl.load(RSTD_ptr + row_idx * RSTD_row_stride)

        cos_row = tl.load(
            COS_ptr + cs_row_idx * COS_row_stride + col_offsets, mask=mask, other=0
        ).to(tl.float32)

        # dN = dY * cos + rotate_half^T(dY * sin)
        # rotate_half^T([a, b]) = [b, -a]  (adjoint of rotate_half)
        #
        # Compute rotate_half_transpose(dY * sin) by loading dY and sin at
        # rotated offsets directly: dY[rot] * sin[rot] * adj_sign
        # This is equivalent to rotating (dY * sin) because the rotation
        # just permutes which elements are multiplied.
        rot_offsets = tl.where(
            col_offsets < half_dim, col_offsets + half_dim, col_offsets - half_dim
        )
        rot_mask = rot_offsets < n_cols
        dY_rot = tl.load(
            dY_ptr + row_idx * dY_row_stride + rot_offsets,
            mask=rot_mask & mask,
            other=0,
        ).to(tl.float32)
        sin_rot = tl.load(
            SIN_ptr + cs_row_idx * SIN_row_stride + rot_offsets,
            mask=rot_mask & mask,
            other=0,
        ).to(tl.float32)

        # Adjoint sign is the mirror of the forward sign pattern.
        adj_sign = tl.where(col_offsets < half_dim, 1.0, -1.0)
        dN = dY_row * cos_row + dY_rot * sin_rot * adj_sign

        # Pre-weight normalized: n = rstd * x
        n = X_row * rstd

        if HAS_WEIGHT:
            # dW = dN * x_hat (pre-weight normalized activations)
            dW_acc += dN * n
            dm = dN * W_row
        else:
            dm = dN

        # RMSNorm backward: dX = rstd * (dm - (1/n_cols) * rstd^2 * dot(dm, X) * X)
        dot_dm_x = tl.sum(dm * X_row, axis=0)
        dX_row = rstd * (dm - (1.0 / n_cols) * rstd * rstd * dot_dm_x * X_row)

        tl.store(
            dX_ptr + row_idx * dX_row_stride + col_offsets,
            dX_row.to(X_dtype),
            mask=mask,
        )

    if HAS_WEIGHT:
        # One partial-gradient row per program; host reduces over dim 0.
        tl.store(
            dW_ptr + row_block_id * dW_row_stride + col_offsets,
            dW_acc,
            mask=mask,
        )
|
||||||
|
|
||||||
|
|
||||||
|
def rms_norm_rope_forward(X, W, cos, sin, eps, n_heads):
    """
    Launch the fused RMSNorm+RoPE forward kernel (one program per row).

    Args:
        X: (B*S*H, head_dim) — contiguous, flattened from (B, S, H, D)
        W: (head_dim,) or None — RMSNorm weight
        cos: (B*S, head_dim) — position embeddings (broadcast across heads)
        sin: (B*S, head_dim) — position embeddings (broadcast across heads)
        eps: float — RMSNorm epsilon
        n_heads: int — number of attention heads (for cos/sin indexing)

    Returns:
        Y, X_saved, RSTD, BLOCK_SIZE, num_warps — X is returned unchanged so
        the autograd wrapper can stash it for backward; BLOCK_SIZE/num_warps
        are reused for the backward launch.
    """
    n_rows, n_cols = X.shape
    # Liger helper picks a power-of-two block covering n_cols plus warp count.
    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
    has_weight = W is not None

    Y = torch.empty_like(X)
    # Per-row 1/rms, kept in fp32 for the backward pass.
    RSTD = torch.empty(n_rows, dtype=torch.float32, device=X.device)

    _rms_norm_rope_forward_kernel[(n_rows,)](
        Y,
        Y.stride(0),
        X,
        X.stride(0),
        W if has_weight else X,  # dummy pointer when no weight
        cos,
        cos.stride(0),
        sin,
        sin.stride(0),
        RSTD,
        RSTD.stride(0),
        n_cols,
        n_heads,
        eps,
        HAS_WEIGHT=has_weight,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=num_warps,
    )
    return Y, X, RSTD, BLOCK_SIZE, num_warps
|
||||||
|
|
||||||
|
|
||||||
|
def rms_norm_rope_backward(dY, X, W, cos, sin, RSTD, n_heads, BLOCK_SIZE, num_warps):
    """
    Launch the fused RMSNorm+RoPE backward kernel.

    Uses one program per SM, each handling a contiguous slice of rows, so that
    the per-column weight gradient can be accumulated per-program and reduced
    on the host with a single sum over dim 0.

    Args:
        dY: (n_rows, n_cols) upstream gradient
        X, W, cos, sin, RSTD: tensors saved by the forward pass
        n_heads: head count used for cos/sin row indexing
        BLOCK_SIZE, num_warps: launch settings from the forward pass

    Returns:
        (dX, dW) — dW is None when W is None.
    """
    n_rows, n_cols = dY.shape
    has_weight = W is not None

    # NOTE(review): CUDA-only query — assumes X lives on a CUDA device.
    sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
    rows_per_program = math.ceil(n_rows / sm_count)

    dX = torch.empty_like(X)

    if has_weight:
        # One partial dW row per program; summed below.
        _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=X.device)
    else:
        # Minimal dummy buffer; the kernel never stores to it without weight.
        _dW = torch.empty((1, n_cols), dtype=torch.float32, device=X.device)

    _rms_norm_rope_backward_kernel[(sm_count,)](
        dY,
        dY.stride(0),
        dX,
        dX.stride(0),
        X,
        X.stride(0),
        torch_to_triton_dtype[X.dtype],
        W if has_weight else X,  # dummy
        cos,
        cos.stride(0),
        sin,
        sin.stride(0),
        RSTD,
        RSTD.stride(0),
        _dW,
        _dW.stride(0),
        n_rows,
        n_cols,
        n_heads,
        rows_per_program,
        HAS_WEIGHT=has_weight,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=num_warps,
    )

    # Reduce per-program partials into the final weight gradient.
    dW = _dW.sum(dim=0).to(W.dtype) if has_weight else None
    return dX, dW
|
||||||
|
|
||||||
|
|
||||||
|
class FusedRMSNormRoPEFunction(torch.autograd.Function):
    """Autograd wrapper around the fused RMSNorm+RoPE Triton kernels."""

    @staticmethod
    @ensure_contiguous
    def forward(ctx, X, W, cos, sin, eps, n_heads):
        """
        X: (B*S*H, head_dim)
        W: (head_dim,) or None
        cos: (B*S, head_dim) — broadcast across heads
        sin: (B*S, head_dim) — broadcast across heads
        n_heads: int
        """
        Y, X_saved, RSTD, BLOCK_SIZE, num_warps = rms_norm_rope_forward(
            X,
            W,
            cos,
            sin,
            eps,
            n_heads,
        )
        # Reuse the forward launch settings verbatim for backward.
        ctx.eps = eps
        ctx.BLOCK_SIZE = BLOCK_SIZE
        ctx.num_warps = num_warps
        ctx.n_heads = n_heads
        ctx.has_weight = W is not None
        ctx.save_for_backward(X_saved, W, cos, sin, RSTD)
        return Y

    @staticmethod
    @ensure_contiguous
    def backward(ctx, dY):
        # Gradients flow to X and W only; cos/sin/eps/n_heads get None.
        X, W, cos, sin, RSTD = ctx.saved_tensors
        dX, dW = rms_norm_rope_backward(
            dY,
            X,
            W,
            cos,
            sin,
            RSTD,
            ctx.n_heads,
            ctx.BLOCK_SIZE,
            ctx.num_warps,
        )
        return dX, dW, None, None, None, None
|
||||||
|
|
||||||
|
|
||||||
|
def fused_rms_norm_rope(x, weight, cos, sin, eps=1e-6):
    """
    Apply fused RMSNorm + RoPE.

    Args:
        x: (batch, seq_len, num_heads, head_dim) — after projection + view
        weight: (head_dim,) — RMSNorm weight, or None for no-scale norm
        cos: (batch, seq_len, head_dim) — from RotaryEmbedding
        sin: (batch, seq_len, head_dim) — from RotaryEmbedding
        eps: float — RMSNorm epsilon

    Returns:
        y: (batch, seq_len, num_heads, head_dim) — normalized + rotated
    """
    batch, seq_len, num_heads, head_dim = x.shape
    # The kernel works on 2D rows: one row per (batch, seq, head) triple.
    rows = x.reshape(-1, head_dim).contiguous()
    # cos/sin carry one row per (batch, seq) pair; the kernel maps each x-row
    # to its cos/sin row by integer-dividing the row index by num_heads.
    cos_rows = cos.reshape(batch * seq_len, head_dim).contiguous()
    sin_rows = sin.reshape(batch * seq_len, head_dim).contiguous()

    out = FusedRMSNormRoPEFunction.apply(
        rows, weight, cos_rows, sin_rows, eps, num_heads
    )
    return out.view(x.shape)
|
||||||
|
|
||||||
|
|
||||||
|
@triton.jit
def _rms_norm_forward_kernel(
    Y_ptr,
    Y_row_stride,
    X_ptr,
    X_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    n_cols,
    eps,
    BLOCK_SIZE: tl.constexpr,
):
    """RMSNorm without scale weight: y = x / rms(x).

    One program per row; the per-row 1/rms is stored to RSTD for reuse in
    the backward kernel.
    """
    # int64 index avoids overflow for very large row counts.
    row_idx = tl.program_id(0).to(tl.int64)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols

    X_row = tl.load(X_ptr + row_idx * X_row_stride + col_offsets, mask=mask, other=0)
    X_dtype = X_row.dtype
    # Compute in fp32; cast back to the input dtype on store.
    X_fp32 = X_row.to(tl.float32)

    mean_sq = tl.sum(X_fp32 * X_fp32, axis=0) / n_cols
    rstd = rsqrt(mean_sq + eps)
    tl.store(RSTD_ptr + row_idx * RSTD_row_stride, rstd)

    Y_row = X_fp32 * rstd
    tl.store(Y_ptr + row_idx * Y_row_stride + col_offsets, Y_row.to(X_dtype), mask=mask)
|
||||||
|
|
||||||
|
|
||||||
|
@triton.jit
def _rms_norm_noscale_backward_kernel(
    dY_ptr,
    dY_row_stride,
    dX_ptr,
    dX_row_stride,
    X_ptr,
    X_row_stride,
    X_dtype: tl.constexpr,
    RSTD_ptr,
    RSTD_row_stride,
    n_cols,
    BLOCK_SIZE: tl.constexpr,
):
    """Backward for y = x * rstd (no weight).

    With no learnable scale there is no weight gradient, so one program per
    row suffices — no cross-row accumulation is needed.
    """
    row_idx = tl.program_id(0).to(tl.int64)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols

    dY_row = tl.load(
        dY_ptr + row_idx * dY_row_stride + col_offsets, mask=mask, other=0
    ).to(tl.float32)
    X_row = tl.load(
        X_ptr + row_idx * X_row_stride + col_offsets, mask=mask, other=0
    ).to(tl.float32)
    # 1/rms cached by the forward kernel.
    rstd = tl.load(RSTD_ptr + row_idx * RSTD_row_stride)

    # RMSNorm backward: dX = rstd * (dY - (1/n_cols) * rstd^2 * dot(dY, X) * X)
    dot_dy_x = tl.sum(dY_row * X_row, axis=0)
    dX_row = rstd * (dY_row - (1.0 / n_cols) * rstd * rstd * dot_dy_x * X_row)

    tl.store(
        dX_ptr + row_idx * dX_row_stride + col_offsets, dX_row.to(X_dtype), mask=mask
    )
|
||||||
|
|
||||||
|
|
||||||
|
class FusedRMSNormNoScaleFunction(torch.autograd.Function):
    """RMSNorm without learnable scale — used for Gemma4's v_norm."""

    @staticmethod
    @ensure_contiguous
    def forward(ctx, X, eps):
        # X: (n_rows, n_cols) — rows are normalized independently.
        n_rows, n_cols = X.shape
        BLOCK_SIZE, num_warps = calculate_settings(n_cols)
        Y = torch.empty_like(X)
        # Per-row 1/rms, cached in fp32 for the backward kernel.
        RSTD = torch.empty(n_rows, dtype=torch.float32, device=X.device)

        _rms_norm_forward_kernel[(n_rows,)](
            Y,
            Y.stride(0),
            X,
            X.stride(0),
            RSTD,
            RSTD.stride(0),
            n_cols,
            eps,
            BLOCK_SIZE=BLOCK_SIZE,
            num_warps=num_warps,
        )
        # Reuse the forward launch settings for backward.
        ctx.BLOCK_SIZE = BLOCK_SIZE
        ctx.num_warps = num_warps
        ctx.save_for_backward(X, RSTD)
        ctx.n_cols = n_cols
        return Y

    @staticmethod
    @ensure_contiguous
    def backward(ctx, dY):
        # Gradient flows to X only; eps gets None.
        X, RSTD = ctx.saved_tensors
        n_rows = X.shape[0]
        dX = torch.empty_like(X)
        _rms_norm_noscale_backward_kernel[(n_rows,)](
            dY,
            dY.stride(0),
            dX,
            dX.stride(0),
            X,
            X.stride(0),
            torch_to_triton_dtype[X.dtype],
            RSTD,
            RSTD.stride(0),
            ctx.n_cols,
            BLOCK_SIZE=ctx.BLOCK_SIZE,
            num_warps=ctx.num_warps,
        )
        return dX, None
|
||||||
|
|
||||||
|
|
||||||
|
def fused_rms_norm_noscale(x, eps=1e-6):
    """
    RMSNorm without scale for v_norm.

    Args:
        x: (batch, seq_len, num_heads, head_dim)
        eps: RMSNorm epsilon

    Returns:
        y: same shape, normalized
    """
    original_shape = x.shape
    # Flatten everything but the last dim; each row is normalized on its own.
    flattened = x.reshape(-1, original_shape[-1])
    normalized = FusedRMSNormNoScaleFunction.apply(flattened, eps)
    return normalized.view(original_shape)
|
||||||
@@ -624,7 +624,14 @@ class ModelLoader:
|
|||||||
|
|
||||||
def _set_attention_config(self):
|
def _set_attention_config(self):
|
||||||
"""Sample packing uses custom FA2 patch"""
|
"""Sample packing uses custom FA2 patch"""
|
||||||
if self.cfg.attn_implementation:
|
if self.cfg.gemma4_hybrid_attn_impl:
|
||||||
|
# Load model with flash_attention_2 for sliding window layers;
|
||||||
|
# global layers will be patched to sdpa post-load.
|
||||||
|
self.model_kwargs["attn_implementation"] = "flash_attention_2"
|
||||||
|
self.model_config._attn_implementation = "flash_attention_2"
|
||||||
|
# Set flash_attention so multipack/sample_packing patches activate
|
||||||
|
self.cfg.flash_attention = True
|
||||||
|
elif self.cfg.attn_implementation:
|
||||||
self.model_kwargs["attn_implementation"] = self.cfg.attn_implementation
|
self.model_kwargs["attn_implementation"] = self.cfg.attn_implementation
|
||||||
elif self.cfg.flex_attention:
|
elif self.cfg.flex_attention:
|
||||||
self.model_kwargs["attn_implementation"] = "flex_attention"
|
self.model_kwargs["attn_implementation"] = "flex_attention"
|
||||||
|
|||||||
@@ -156,6 +156,7 @@ class PatchManager:
|
|||||||
# which would clobber any earlier fix.
|
# which would clobber any earlier fix.
|
||||||
self._fix_nemotron_h_conversion_mapping()
|
self._fix_nemotron_h_conversion_mapping()
|
||||||
|
|
||||||
|
self._apply_gemma_hybrid_attention(model)
|
||||||
self._finalize_moe_expert_quantization(model)
|
self._finalize_moe_expert_quantization(model)
|
||||||
|
|
||||||
def apply_post_model_load_patches(self, model: PreTrainedModel):
|
def apply_post_model_load_patches(self, model: PreTrainedModel):
|
||||||
@@ -165,6 +166,72 @@ class PatchManager:
|
|||||||
self._apply_lora_kernel_patch(model)
|
self._apply_lora_kernel_patch(model)
|
||||||
self._apply_scaling_softmax_patch(model)
|
self._apply_scaling_softmax_patch(model)
|
||||||
|
|
||||||
|
def _apply_gemma_hybrid_attention(self, model: PreTrainedModel):
|
||||||
|
"""Apply hybrid attention: FA2 for sliding window layers, SDPA for global layers.
|
||||||
|
|
||||||
|
Gemma 4 has global (full_attention) layers with head_dim=512
|
||||||
|
which exceeds flash attention's supported size. This patch loads the model
|
||||||
|
with flash_attention_2 for the sliding window layers (head_dim=256), then
|
||||||
|
gives each global layer a shallow-copied config with _attn_implementation="sdpa".
|
||||||
|
"""
|
||||||
|
if not self.cfg.gemma4_hybrid_attn_impl:
|
||||||
|
return
|
||||||
|
|
||||||
|
import copy
|
||||||
|
|
||||||
|
# Navigate to the module that has 'layers' - varies by model structure:
|
||||||
|
# Gemma4ForConditionalGeneration -> .model (Gemma4Model) -> .language_model (Gemma4TextModel) -> .layers
|
||||||
|
# Gemma4ForCausalLM -> .model (Gemma4TextModel) -> .layers
|
||||||
|
layers = None
|
||||||
|
config_source = None
|
||||||
|
for candidate in [model, getattr(model, "model", None)]:
|
||||||
|
if candidate is None:
|
||||||
|
continue
|
||||||
|
# Check direct layers
|
||||||
|
if hasattr(candidate, "layers"):
|
||||||
|
layers = candidate.layers
|
||||||
|
config_source = candidate
|
||||||
|
break
|
||||||
|
# Check language_model.layers (multimodal wrapper)
|
||||||
|
lang_model = getattr(candidate, "language_model", None)
|
||||||
|
if lang_model is not None and hasattr(lang_model, "layers"):
|
||||||
|
layers = lang_model.layers
|
||||||
|
config_source = lang_model
|
||||||
|
break
|
||||||
|
|
||||||
|
if layers is None:
|
||||||
|
LOG.warning(
|
||||||
|
"gemma4_hybrid_attn_impl: could not find decoder layers in model, skipping"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
config = getattr(config_source, "config", self.model_config)
|
||||||
|
layer_types = getattr(config, "layer_types", None)
|
||||||
|
if layer_types is None:
|
||||||
|
LOG.warning(
|
||||||
|
"gemma4_hybrid_attn_impl: model config has no 'layer_types', skipping. "
|
||||||
|
"This feature requires a model with mixed sliding/global attention layers."
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
patched_count = 0
|
||||||
|
for layer_idx, layer in enumerate(layers):
|
||||||
|
if layer_types[layer_idx] != "sliding_attention":
|
||||||
|
# Global / full_attention layer - use SDPA instead of FA2
|
||||||
|
attn_module = getattr(layer, "self_attn", None)
|
||||||
|
if attn_module is not None and hasattr(attn_module, "config"):
|
||||||
|
sdpa_config = copy.copy(attn_module.config)
|
||||||
|
sdpa_config._attn_implementation = "sdpa"
|
||||||
|
attn_module.config = sdpa_config
|
||||||
|
patched_count += 1
|
||||||
|
|
||||||
|
LOG.info(
|
||||||
|
"gemma4_hybrid_attn_impl: patched %d global layers to use SDPA "
|
||||||
|
"(remaining %d sliding layers use flash_attention_2)",
|
||||||
|
patched_count,
|
||||||
|
len(layers) - patched_count,
|
||||||
|
)
|
||||||
|
|
||||||
def _apply_flash_attention_patches(self):
|
def _apply_flash_attention_patches(self):
|
||||||
"""Apply patches related to Flash Attention."""
|
"""Apply patches related to Flash Attention."""
|
||||||
if self.cfg.xformers_attention and self.cfg.sample_packing:
|
if self.cfg.xformers_attention and self.cfg.sample_packing:
|
||||||
@@ -324,6 +391,13 @@ class PatchManager:
|
|||||||
|
|
||||||
patch_qwen3_5_vlm_flash_attention()
|
patch_qwen3_5_vlm_flash_attention()
|
||||||
|
|
||||||
|
if self.cfg.model_config_type in ("gemma4", "gemma4_text"):
|
||||||
|
from axolotl.monkeypatch.models.gemma4.fused_attn import (
|
||||||
|
patch_gemma4_fused_attn,
|
||||||
|
)
|
||||||
|
|
||||||
|
patch_gemma4_fused_attn()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _fix_nemotron_h_conversion_mapping():
|
def _fix_nemotron_h_conversion_mapping():
|
||||||
"""Remove the spurious embedding→embeddings WeightRenaming from the
|
"""Remove the spurious embedding→embeddings WeightRenaming from the
|
||||||
|
|||||||
147
src/axolotl/monkeypatch/models/gemma4/fused_attn.py
Normal file
147
src/axolotl/monkeypatch/models/gemma4/fused_attn.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
"""
|
||||||
|
Gemma 4 fused attention monkeypatch.
|
||||||
|
|
||||||
|
Replaces the per-layer RMSNorm + RoPE + transpose sequence with fused Triton
|
||||||
|
kernels, eliminating intermediate tensor allocations from rotate_half / apply_rotary_pos_emb
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from axolotl.monkeypatch.models.gemma4.fused_attn import patch_gemma4_fused_attn
|
||||||
|
patch_gemma4_fused_attn()
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_fused_forward(original_forward):
    """Create a patched forward that uses fused RMSNorm+RoPE kernels.

    `original_forward` is kept for signature parity with the patch site; the
    returned closure re-implements the attention forward rather than wrapping it.
    """

    from axolotl.kernels.gemma4_fused_rope import (
        fused_rms_norm_noscale,
        fused_rms_norm_rope,
    )

    def fused_forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: torch.Tensor,
        attention_mask: torch.Tensor | None,
        shared_kv_states: dict[int, tuple[torch.Tensor, torch.Tensor]],
        past_key_values=None,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
        from transformers.models.gemma4.modeling_gemma4 import (
            eager_attention_forward,
        )

        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)
        eps = self.config.rms_norm_eps

        cos, sin = position_embeddings

        # ---- Projections ----
        # Use apply_qkv if present (LoRA kernel patch), otherwise direct proj
        has_lora_qkv = hasattr(self, "apply_qkv")

        if has_lora_qkv:
            query_states, key_states, value_states = self.apply_qkv(hidden_states)
            query_states = query_states.view(hidden_shape)
        else:
            query_states = self.q_proj(hidden_states).view(hidden_shape)

        # ---- Q path: fused q_norm + RoPE ----
        query_states = fused_rms_norm_rope(
            query_states,
            self.q_norm.weight,
            cos,
            sin,
            eps=eps,
        )
        query_states = query_states.transpose(1, 2)

        # ---- K/V path ----
        if self.is_kv_shared_layer:
            # Reuse K/V produced (already normed + roped + transposed) by an
            # earlier layer; just ensure they live on the right device.
            key_states, value_states = shared_kv_states[self.kv_shared_layer_index]
            key_states = key_states.to(query_states.device)
            value_states = value_states.to(query_states.device)
        else:
            if has_lora_qkv:
                # apply_qkv already computed k/v projections
                key_states = key_states.view(hidden_shape)
                value_states = (
                    value_states.view(hidden_shape)
                    if self.v_proj is not None
                    else key_states
                )
            else:
                key_states = self.k_proj(hidden_states).view(hidden_shape)
                value_states = (
                    self.v_proj(hidden_states).view(hidden_shape)
                    if self.v_proj is not None
                    else key_states
                )

            # Fused k_norm + RoPE
            key_states = fused_rms_norm_rope(
                key_states,
                self.k_norm.weight,
                cos,
                sin,
                eps=eps,
            )
            key_states = key_states.transpose(1, 2)

            # Fused v_norm (no scale, no RoPE)
            value_states = fused_rms_norm_noscale(value_states, eps=eps)
            value_states = value_states.transpose(1, 2)

        # KV-shared layers read from an earlier layer's cache entry, so only
        # non-shared layers push into the cache.
        if past_key_values is not None and not self.is_kv_shared_layer:
            key_states, value_states = past_key_values.update(
                key_states, value_states, self.layer_idx
            )
        if self.store_full_length_kv:
            shared_kv_states[self.layer_idx] = key_states, value_states

        # eager fallback; otherwise dispatch on the (possibly per-layer
        # patched) _attn_implementation, e.g. FA2 vs SDPA hybrid.
        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[
                self.config._attn_implementation
            ]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=self.attention_dropout if self.training else 0.0,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights

    return fused_forward
|
||||||
|
|
||||||
|
|
||||||
|
def patch_gemma4_fused_attn():
    """
    Monkeypatch Gemma4TextAttention.forward to use fused RMSNorm+RoPE kernels.
    """
    from transformers.models.gemma4.modeling_gemma4 import Gemma4TextAttention

    # Keep a handle to the stock forward and install the fused replacement.
    prior_forward = Gemma4TextAttention.forward
    Gemma4TextAttention.forward = _make_fused_forward(prior_forward)

    logger.info(
        "Patched Gemma4TextAttention.forward with fused RMSNorm+RoPE Triton kernels"
    )
|
||||||
@@ -777,6 +777,15 @@ class AxolotlInputConfig(
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
gemma4_hybrid_attn_impl: bool | None = Field(
|
||||||
|
default=None,
|
||||||
|
json_schema_extra={
|
||||||
|
"description": "Use hybrid attention for Gemma 4: flash_attention_2 for sliding window layers "
|
||||||
|
"and sdpa for global (full_attention) layers. Global layers have head_dim=512 which "
|
||||||
|
"exceeds flash attention's supported size."
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
experts_implementation: str | None = Field(
|
experts_implementation: str | None = Field(
|
||||||
default=None,
|
default=None,
|
||||||
json_schema_extra={
|
json_schema_extra={
|
||||||
|
|||||||
226
tests/kernels/test_gemma4_fused_rope.py
Normal file
226
tests/kernels/test_gemma4_fused_rope.py
Normal file
@@ -0,0 +1,226 @@
|
|||||||
|
"""
|
||||||
|
Correctness tests for the fused RMSNorm+RoPE Triton kernel.
|
||||||
|
|
||||||
|
Tests forward and backward against the reference Gemma4 implementation
|
||||||
|
(Gemma4RMSNorm + apply_rotary_pos_emb) across both sliding window
|
||||||
|
(head_dim=256) and global attention (head_dim=512) layer configurations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
|
||||||
|
torch.manual_seed(42)
|
||||||
|
|
||||||
|
# Skip entire module if no CUDA
|
||||||
|
pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
|
||||||
|
|
||||||
|
|
||||||
|
def _reference_norm_rope(x, weight, cos, sin, eps):
    """Reference: separate Gemma4RMSNorm + apply_rotary_pos_emb."""
    from transformers.models.gemma4.modeling_gemma4 import (
        Gemma4RMSNorm,
        apply_rotary_pos_emb,
    )

    D = x.shape[-1]
    # Build a norm module with the given weight so it matches the fused path.
    norm = Gemma4RMSNorm(D, eps=eps).to(x.device, x.dtype)
    norm.weight.data.copy_(weight)
    normed = norm(x)
    # unsqueeze_dim=2 broadcasts cos/sin over the heads axis of (B, S, H, D).
    return apply_rotary_pos_emb(normed, cos, sin, unsqueeze_dim=2)
|
||||||
|
|
||||||
|
|
||||||
|
def _reference_norm_noscale(x, eps):
    """Reference: Gemma4RMSNorm with_scale=False."""
    from transformers.models.gemma4.modeling_gemma4 import Gemma4RMSNorm

    head_dim = x.shape[-1]
    # No learnable scale — the module only normalizes.
    ref_norm = Gemma4RMSNorm(head_dim, eps=eps, with_scale=False)
    return ref_norm.to(x.device, x.dtype)(x)
|
||||||
|
|
||||||
|
|
||||||
|
# (batch, seq_len, num_heads, head_dim) configurations under test.
_SHAPE_CASES = [
    (2, 64, 32, 256),  # sliding window layer shape
    (2, 64, 4, 512),  # global attention layer shape
    (1, 128, 16, 256),  # different batch/seq
    (1, 1, 1, 8),  # minimal size
]


@pytest.fixture(
    params=_SHAPE_CASES,
    ids=["sliding_256", "global_512", "varied", "minimal"],
)
def shapes(request):
    """Yield one (batch, seq_len, num_heads, head_dim) tuple per case."""
    return request.param
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(params=[torch.bfloat16, torch.float16], ids=["bf16", "fp16"])
def dtype(request):
    """Reduced-precision dtypes the fused kernel is exercised in."""
    return request.param
|
||||||
|
|
||||||
|
|
||||||
|
class TestFusedRMSNormRoPEForward:
    """Forward pass correctness."""

    def test_matches_reference(self, shapes, dtype):
        """Fused output agrees with the reference norm + RoPE composition."""
        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope

        batch, seq, heads, dim = shapes
        eps = 1e-6
        x = torch.randn(batch, seq, heads, dim, device="cuda", dtype=dtype)
        weight = torch.randn(dim, device="cuda", dtype=dtype)
        cos = torch.randn(batch, seq, dim, device="cuda", dtype=dtype)
        sin = torch.randn(batch, seq, dim, device="cuda", dtype=dtype)

        expected = _reference_norm_rope(x.clone(), weight, cos, sin, eps)
        actual = fused_rms_norm_rope(x.clone(), weight, cos, sin, eps=eps)

        # Cosine similarity is robust to the bf16/fp16 rounding noise that an
        # exact allclose would trip over.
        cos_sim = torch.nn.functional.cosine_similarity(
            expected.flatten().float(), actual.flatten().float(), dim=0
        )
        assert cos_sim > 0.999, f"Forward cosine_sim={cos_sim:.6f}, expected > 0.999"

    def test_output_shape(self, shapes):
        """Output preserves the input's shape and dtype."""
        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope

        batch, seq, heads, dim = shapes
        x = torch.randn(batch, seq, heads, dim, device="cuda", dtype=torch.bfloat16)
        weight = torch.randn(dim, device="cuda", dtype=torch.bfloat16)
        cos = torch.randn(batch, seq, dim, device="cuda", dtype=torch.bfloat16)
        sin = torch.randn(batch, seq, dim, device="cuda", dtype=torch.bfloat16)

        out = fused_rms_norm_rope(x, weight, cos, sin, eps=1e-6)
        assert out.shape == x.shape
        assert out.dtype == x.dtype
|
||||||
|
|
||||||
|
|
||||||
|
class TestFusedRMSNormRoPEBackward:
    """Backward pass correctness via gradient comparison.

    Runs the fused kernel and the reference Gemma4RMSNorm +
    apply_rotary_pos_emb path on identical inputs, backprops a sum loss
    through both, and compares the resulting gradients.
    """

    @pytest.mark.parametrize(
        "B,S,H,D",
        [(2, 64, 32, 256), (2, 64, 4, 512)],
        ids=["sliding_256", "global_512"],
    )
    def test_x_grad_matches_reference(self, B, S, H, D):
        """Input gradients from the fused kernel match the reference path."""
        from transformers.models.gemma4.modeling_gemma4 import (
            Gemma4RMSNorm,
            apply_rotary_pos_emb,
        )

        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope

        eps = 1e-6
        cos = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
        sin = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
        weight_init = torch.randn(D, device="cuda", dtype=torch.bfloat16)

        # Reference backward
        x_ref = torch.randn(
            B, S, H, D, device="cuda", dtype=torch.bfloat16, requires_grad=True
        )
        norm_ref = Gemma4RMSNorm(D, eps=eps).cuda().to(torch.bfloat16)
        norm_ref.weight.data.copy_(weight_init)
        y_ref = apply_rotary_pos_emb(norm_ref(x_ref), cos, sin, unsqueeze_dim=2)
        y_ref.sum().backward()

        # Fused backward. Use detach().clone() rather than the deprecated
        # Tensor.data, which bypasses autograd bookkeeping and can hide
        # version-counter errors.
        x_fused = x_ref.detach().clone().requires_grad_(True)
        w_fused = weight_init.clone().requires_grad_(True)
        y_fused = fused_rms_norm_rope(x_fused, w_fused, cos, sin, eps=eps)
        y_fused.sum().backward()

        cos_sim_x = torch.nn.functional.cosine_similarity(
            x_fused.grad.flatten().float(), x_ref.grad.flatten().float(), dim=0
        )
        assert cos_sim_x > 0.999, f"x grad cosine_sim={cos_sim_x:.6f}, expected > 0.999"

    @pytest.mark.parametrize(
        "B,S,H,D",
        [(2, 64, 32, 256), (2, 64, 4, 512)],
        ids=["sliding_256", "global_512"],
    )
    def test_weight_grad_matches_reference(self, B, S, H, D):
        """RMSNorm weight gradients from the fused kernel match the reference."""
        from transformers.models.gemma4.modeling_gemma4 import (
            Gemma4RMSNorm,
            apply_rotary_pos_emb,
        )

        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope

        eps = 1e-6
        cos = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
        sin = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
        weight_init = torch.randn(D, device="cuda", dtype=torch.bfloat16)

        # Reference
        x_ref = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
        norm_ref = Gemma4RMSNorm(D, eps=eps).cuda().to(torch.bfloat16)
        norm_ref.weight = torch.nn.Parameter(weight_init.clone())
        apply_rotary_pos_emb(
            norm_ref(x_ref), cos, sin, unsqueeze_dim=2
        ).sum().backward()

        # Fused
        w_fused = weight_init.clone().requires_grad_(True)
        fused_rms_norm_rope(x_ref.clone(), w_fused, cos, sin, eps=eps).sum().backward()

        # Weight grads reduce over every row, so tolerance is slightly looser
        # than for the per-element input gradients.
        cos_sim_w = torch.nn.functional.cosine_similarity(
            w_fused.grad.flatten().float(),
            norm_ref.weight.grad.flatten().float(),
            dim=0,
        )
        assert cos_sim_w > 0.995, (
            f"weight grad cosine_sim={cos_sim_w:.6f}, expected > 0.995"
        )

    def test_grad_flows(self):
        """Verify gradients are non-zero and finite."""
        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope

        B, S, H, D = 1, 16, 4, 64
        x = torch.randn(
            B, S, H, D, device="cuda", dtype=torch.bfloat16, requires_grad=True
        )
        w = torch.randn(D, device="cuda", dtype=torch.bfloat16, requires_grad=True)
        cos = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
        sin = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)

        y = fused_rms_norm_rope(x, w, cos, sin, eps=1e-6)
        y.sum().backward()

        assert x.grad is not None, "x.grad is None"
        assert w.grad is not None, "w.grad is None"
        assert x.grad.isfinite().all(), "x.grad has non-finite values"
        assert w.grad.isfinite().all(), "w.grad has non-finite values"
        assert x.grad.abs().sum() > 0, "x.grad is all zeros"
        assert w.grad.abs().sum() > 0, "w.grad is all zeros"
|
||||||
|
|
||||||
|
|
||||||
|
class TestFusedRMSNormNoScale:
    """Tests for v_norm (RMSNorm without learnable scale)."""

    def test_forward_matches_reference(self, shapes, dtype):
        """Fused no-scale norm agrees with the reference module."""
        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_noscale

        batch, seq, heads, dim = shapes
        eps = 1e-6
        x = torch.randn(batch, seq, heads, dim, device="cuda", dtype=dtype)

        expected = _reference_norm_noscale(x.clone(), eps)
        actual = fused_rms_norm_noscale(x.clone(), eps=eps)

        cos_sim = torch.nn.functional.cosine_similarity(
            expected.flatten().float(), actual.flatten().float(), dim=0
        )
        assert cos_sim > 0.999, f"v_norm cosine_sim={cos_sim:.6f}, expected > 0.999"

    def test_backward_flows(self):
        """Gradients propagate through the no-scale path and stay finite."""
        from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_noscale

        x = torch.randn(
            1, 16, 4, 64, device="cuda", dtype=torch.bfloat16, requires_grad=True
        )
        fused_rms_norm_noscale(x, eps=1e-6).sum().backward()

        assert x.grad is not None
        assert x.grad.isfinite().all()
        assert x.grad.abs().sum() > 0
|
||||||
Reference in New Issue
Block a user