Compare commits
10 Commits
quantize-p ... topk-logpr
| Author | SHA1 | Date |
|---|---|---|
| | 68e97d032a | |
| | 23f029a89c | |
| | afbb44f08b | |
| | d753ead033 | |
| | c011405117 | |
| | a2e52a29e9 | |
| | e82268e580 | |
| | 75e1480c10 | |
| | 45e1548d59 | |
| | 165088e7c1 | |
391 src/axolotl/integrations/kd/topk_logprob/bench_kl.py Normal file

@@ -0,0 +1,391 @@
```python
"""
benchmark utility helper for benchmarking the KL divergence triton kernel
"""
import gc
import time

import torch
from torch.utils.benchmark import Timer

from axolotl.integrations.kd.topk_logprob.forward_kl import loss as eager_loss
from axolotl.integrations.kd.topk_logprob.forward_kl_triton import loss as triton_loss


# pylint: disable=cell-var-from-loop
def benchmark_kl_div_loss_with_backward():
    # Test configurations
    batch_sizes = [1, 4]
    seq_lens = [64, 512, 2048, 4096, 8192]
    vocab_size = 32000
    top_k = 64

    # Store results
    results = []

    # Run benchmarks
    for batch_size in batch_sizes:
        for seq_len in seq_lens:
            # Generate random test data
            torch.manual_seed(42)

            # Create tensors with gradients
            student_logits = torch.randn(
                batch_size, seq_len, vocab_size, device="cuda", requires_grad=True
            )
            # pylint: disable=duplicate-code
            target_token_ids = torch.randint(
                0, vocab_size, (batch_size, seq_len, top_k), device="cuda"
            )
            target_logprobs_raw = torch.randn(batch_size, seq_len, top_k, device="cuda")
            target_logprobs = torch.log_softmax(target_logprobs_raw, dim=-1)
            target_mask = torch.randint(
                0, 2, (batch_size, seq_len, top_k), device="cuda"
            ).float()

            # Clone student_logits for the two implementations
            student_logits_ref = student_logits.clone().detach().requires_grad_(True)
            student_logits_triton = student_logits.clone().detach().requires_grad_(True)

            # Define functions for timing that include both forward and backward passes
            def run_reference():
                # Forward pass
                loss_ref = eager_loss(
                    student_logits_ref, target_token_ids, target_logprobs, target_mask
                )
                # Backward pass
                loss_ref.backward()

            def run_triton():
                # Forward pass
                # pylint: disable=duplicate-code
                loss_triton = triton_loss(
                    student_logits_triton,
                    target_token_ids,
                    target_logprobs,
                    target_mask,
                )
                # Backward pass
                loss_triton.backward()

            # Benchmark reference implementation (forward + backward)
            t0 = Timer(
                stmt="run_reference()",
                globals={
                    "run_reference": run_reference,
                },
            )
            # Reset gradients before timing
            student_logits_ref.grad = None
            ref_time = t0.timeit(10).median * 1000  # Convert to ms

            # Benchmark Triton implementation (forward + backward)
            t1 = Timer(
                stmt="run_triton()",
                globals={
                    "run_triton": run_triton,
                },
            )
            # Reset gradients before timing
            student_logits_triton.grad = None
            triton_time = t1.timeit(10).median * 1000  # Convert to ms

            # Compute speedup
            speedup = ref_time / triton_time if triton_time > 0 else float("inf")

            # Store results
            results.append(
                {
                    "batch_size": batch_size,
                    "seq_len": seq_len,
                    "reference_time_ms": ref_time,
                    "triton_time_ms": triton_time,
                    "speedup": speedup,
                }
            )

            print(f"Batch size: {batch_size}, Seq len: {seq_len}")
            print(f" Reference time (fwd+bwd): {ref_time:.2f} ms")
            print(f" Triton time (fwd+bwd): {triton_time:.2f} ms")
            print(f" Speedup: {speedup:.2f}x")

    return results


def benchmark_forward_backward_separately():
    """
    Benchmark forward and backward passes separately to identify where the speedup comes from.
    """
    # Test configurations
    batch_sizes = [1, 4, 8]
    seq_lens = [64, 512, 2048]
    vocab_size = 32000
    top_k = 64

    # Store results
    detailed_results = []

    # Run benchmarks
    for batch_size in batch_sizes:
        for seq_len in seq_lens:
            # Generate random test data
            torch.manual_seed(42)

            # Create tensors with gradients
            student_logits = torch.randn(
                batch_size, seq_len, vocab_size, device="cuda", requires_grad=True
            )
            # pylint: disable=duplicate-code
            target_token_ids = torch.randint(
                0, vocab_size, (batch_size, seq_len, top_k), device="cuda"
            )
            target_logprobs_raw = torch.randn(batch_size, seq_len, top_k, device="cuda")
            target_logprobs = torch.log_softmax(target_logprobs_raw, dim=-1)
            target_mask = torch.randint(
                0, 2, (batch_size, seq_len, top_k), device="cuda"
            ).float()

            # Clone student_logits for the two implementations
            student_logits_ref = student_logits.clone().detach().requires_grad_(True)
            student_logits_triton = student_logits.clone().detach().requires_grad_(True)

            # Forward-only reference
            def run_reference_forward():
                with torch.no_grad():
                    return eager_loss(
                        student_logits_ref,
                        target_token_ids,
                        target_logprobs,
                        target_mask,
                    )

            # Forward-only triton
            def run_triton_forward():
                with torch.no_grad():
                    return triton_loss(
                        student_logits_triton,
                        target_token_ids,
                        target_logprobs,
                        target_mask,
                    )

            # Benchmark forward pass only

            t0_fwd = Timer(
                stmt="run_reference_forward()",
                globals={
                    "run_reference_forward": run_reference_forward,
                },
            )
            ref_fwd_time = t0_fwd.timeit(10).median * 1000  # Convert to ms

            t1_fwd = Timer(
                stmt="run_triton_forward()",
                globals={
                    "run_triton_forward": run_triton_forward,
                },
            )
            triton_fwd_time = t1_fwd.timeit(10).median * 1000  # Convert to ms

            # Pre-compute losses for backward pass benchmarking
            loss_ref = eager_loss(
                student_logits_ref, target_token_ids, target_logprobs, target_mask
            )
            loss_triton = triton_loss(
                student_logits_triton, target_token_ids, target_logprobs, target_mask
            )

            # Backward-only reference
            def run_reference_backward():
                student_logits_ref.grad = None
                loss_ref.backward()

            # Backward-only triton
            def run_triton_backward():
                student_logits_triton.grad = None
                loss_triton.backward()

            # Benchmark backward pass only
            t0_bwd = Timer(
                stmt="run_reference_backward()",
                globals={
                    "run_reference_backward": run_reference_backward,
                },
            )
            ref_bwd_time = t0_bwd.timeit(10).median * 1000  # Convert to ms

            t1_bwd = Timer(
                stmt="run_triton_backward()",
                globals={
                    "run_triton_backward": run_triton_backward,
                },
            )
            triton_bwd_time = t1_bwd.timeit(10).median * 1000  # Convert to ms

            # Compute speedups
            fwd_speedup = (
                ref_fwd_time / triton_fwd_time if triton_fwd_time > 0 else float("inf")
            )
            bwd_speedup = (
                ref_bwd_time / triton_bwd_time if triton_bwd_time > 0 else float("inf")
            )
            total_ref_time = ref_fwd_time + ref_bwd_time
            total_triton_time = triton_fwd_time + triton_bwd_time
            total_speedup = (
                total_ref_time / total_triton_time
                if total_triton_time > 0
                else float("inf")
            )

            # Store results
            detailed_results.append(
                {
                    "batch_size": batch_size,
                    "seq_len": seq_len,
                    "ref_forward_ms": ref_fwd_time,
                    "triton_forward_ms": triton_fwd_time,
                    "forward_speedup": fwd_speedup,
                    "ref_backward_ms": ref_bwd_time,
                    "triton_backward_ms": triton_bwd_time,
                    "backward_speedup": bwd_speedup,
                    "total_ref_ms": total_ref_time,
                    "total_triton_ms": total_triton_time,
                    "total_speedup": total_speedup,
                }
            )

            print(f"Batch size: {batch_size}, Seq len: {seq_len}")
            print(
                f" Forward: Reference={ref_fwd_time:.2f}ms, Triton={triton_fwd_time:.2f}ms, Speedup={fwd_speedup:.2f}x"
            )
            print(
                f" Backward: Reference={ref_bwd_time:.2f}ms, Triton={triton_bwd_time:.2f}ms, Speedup={bwd_speedup:.2f}x"
            )
            print(
                f" Total: Reference={total_ref_time:.2f}ms, Triton={total_triton_time:.2f}ms, Speedup={total_speedup:.2f}x"
            )

    return detailed_results


def benchmark_memory_usage_with_backward():
    # Test configurations
    batch_sizes = [1, 2]
    seq_len = 8192
    vocab_size = 128000
    top_k = 64

    # Store results
    mem_results = []

    # Run benchmarks
    for batch_size in batch_sizes:
        # Generate random test data
        torch.manual_seed(42)
        student_logits = torch.randn(
            batch_size, seq_len, vocab_size, device="cuda", requires_grad=True
        )
        target_token_ids = torch.randint(
            0, vocab_size, (batch_size, seq_len, top_k), device="cuda"
        )
        target_logprobs_raw = torch.randn(batch_size, seq_len, top_k, device="cuda")
        target_logprobs = torch.log_softmax(target_logprobs_raw, dim=-1)
        target_mask = torch.randint(
            0, 2, (batch_size, seq_len, top_k), device="cuda"
        ).float()

        # Clone student_logits for the implementations
        student_logits_ref = student_logits.clone().detach().requires_grad_(True)
        student_logits_triton = student_logits.clone().detach().requires_grad_(True)

        # Measure PyTorch memory usage (forward + backward)
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
        loss_ref = eager_loss(
            student_logits_ref, target_token_ids, target_logprobs, target_mask
        )
        loss_ref.backward()
        torch.cuda.synchronize()
        pytorch_mem = torch.cuda.max_memory_allocated() / (1024**2)  # Convert to MB

        # Measure Triton memory usage (forward + backward)
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
        loss_triton = triton_loss(
            student_logits_triton, target_token_ids, target_logprobs, target_mask
        )
        loss_triton.backward()
        torch.cuda.synchronize()
        triton_mem = torch.cuda.max_memory_allocated() / (1024**2)  # Convert to MB

        # Measure Triton memory usage with different chunk sizes (forward + backward)
        for n_chunks in [1, 2, 4, 8]:
            student_logits_chunk = student_logits.clone().detach().requires_grad_(True)

            torch.cuda.reset_peak_memory_stats()
            torch.cuda.synchronize()
            loss_chunk = triton_loss(
                student_logits_chunk,
                target_token_ids,
                target_logprobs,
                target_mask,
            )
            loss_chunk.backward()
            torch.cuda.synchronize()
            chunk_mem = torch.cuda.max_memory_allocated() / (1024**2)  # Convert to MB

            mem_results.append(
                {
                    "batch_size": batch_size,
                    "implementation": f"Triton (chunks={n_chunks})",
                    "memory_mb": chunk_mem,
                }
            )

        # Store results
        mem_results.append(
            {
                "batch_size": batch_size,
                "implementation": "PyTorch",
                "memory_mb": pytorch_mem,
            }
        )

        mem_results.append(
            {
                "batch_size": batch_size,
                "implementation": "Triton (default)",
                "memory_mb": triton_mem,
            }
        )

        print(f"Batch size: {batch_size} (with backward pass)")
        print(f" PyTorch memory: {pytorch_mem:.2f} MB")
        print(f" Triton memory: {triton_mem:.2f} MB")
        print(f" Memory reduction: {(1 - triton_mem/pytorch_mem)*100:.2f}%")

    return mem_results


def main():
    print("Running benchmarks with forward and backward passes...")
    benchmark_kl_div_loss_with_backward()
    clean()

    print("\nRunning detailed forward/backward benchmarks...")
    # benchmark_forward_backward_separately()
    # clean()

    print("\nRunning memory usage benchmarks with backward passes...")
    benchmark_memory_usage_with_backward()
    clean()


def clean():
    for _ in range(5):
        gc.collect()
        torch.cuda.empty_cache()
        time.sleep(1)


if __name__ == "__main__":
    main()
```
750 src/axolotl/integrations/kd/topk_logprob/forward_kl_triton.py Normal file

@@ -0,0 +1,750 @@
```python
"""
Optimized Triton kernel for KL divergence loss between teacher and student models.
"""
# pylint: disable=invalid-name,unused-argument
from typing import Optional, Tuple

import torch
import triton
import triton.language as tl


@triton.jit
def fused_logsumexp_logprobs_kernel(
    student_logits_ptr,  # Input logits in original dtype
    student_logprobs_ptr,  # Output logprobs (float32)
    token_ids_ptr,  # Token IDs for top-k
    B,
    S,
    V,
    K,  # batch size, seq len, vocab size, top-k
    temperature,
    stride_l_b,
    stride_l_s,
    stride_l_v,
    stride_lp_b,
    stride_lp_s,
    stride_lp_k,
    stride_t_b,
    stride_t_s,
    stride_t_k,
    BLOCK_SIZE: tl.constexpr,
):
    """
    Fused kernel that computes logsumexp and logprobs for topk tokens.
    All computations are done in float32 for numerical stability.
    """
    # Program ID
    pid = tl.program_id(0)
    batch_idx = pid // S
    seq_idx = pid % S

    # Bounds check
    if batch_idx >= B or seq_idx >= S:
        return

    # Compute logsumexp over the vocabulary
    max_val = -float("inf")

    # Phase 1: Find max value across vocabulary
    for v_offset in range(0, V, BLOCK_SIZE):
        # Create block indices and mask
        block_size = min(BLOCK_SIZE, V - v_offset)
        block_idx = tl.arange(0, BLOCK_SIZE)
        mask = block_idx < block_size

        # Load logits block and convert to float32 in-place
        ptrs = (
            student_logits_ptr
            + batch_idx * stride_l_b
            + seq_idx * stride_l_s
            + (v_offset + block_idx) * stride_l_v
        )
        block_logits = tl.load(ptrs, mask=mask, other=-float("inf")).to(tl.float32)

        # Apply temperature scaling if needed
        if temperature != 1.0:
            block_logits = block_logits / temperature

        # Update max value
        block_max = tl.max(block_logits, axis=0)
        max_val = tl.maximum(max_val, block_max)

    # Phase 2: Compute sum of exp(logits - max_val)
    sum_exp = 0.0

    for v_offset in range(0, V, BLOCK_SIZE):
        # Create block indices and mask
        block_size = min(BLOCK_SIZE, V - v_offset)
        block_idx = tl.arange(0, BLOCK_SIZE)
        mask = block_idx < block_size

        # Load logits block and convert to float32 in-place
        ptrs = (
            student_logits_ptr
            + batch_idx * stride_l_b
            + seq_idx * stride_l_s
            + (v_offset + block_idx) * stride_l_v
        )
        block_logits = tl.load(ptrs, mask=mask, other=-float("inf")).to(tl.float32)

        # Apply temperature scaling if needed
        if temperature != 1.0:
            block_logits = block_logits / temperature

        # Compute exp(logits - max_val) and add to sum
        block_exp = tl.exp(block_logits - max_val)
        sum_exp += tl.sum(block_exp * mask, axis=0)

    # Compute final logsumexp
    logsumexp = max_val + tl.log(sum_exp)

    # Phase 3: Compute and store logprobs for the top-k tokens
    token_ids_base = token_ids_ptr + batch_idx * stride_t_b + seq_idx * stride_t_s
    logprobs_base = (
        student_logprobs_ptr + batch_idx * stride_lp_b + seq_idx * stride_lp_s
    )

    for k in range(K):
        # Load token ID for position k
        token_id = tl.load(token_ids_base + k * stride_t_k)

        # Load the corresponding logit and convert to float32
        token_logit_ptr = (
            student_logits_ptr
            + batch_idx * stride_l_b
            + seq_idx * stride_l_s
            + token_id * stride_l_v
        )
        token_logit = tl.load(token_logit_ptr).to(tl.float32)

        # Apply temperature scaling if needed
        if temperature != 1.0:
            token_logit = token_logit / temperature

        # Compute logprob directly: logit - logsumexp
        token_logprob = token_logit - logsumexp

        # Store the result
        tl.store(logprobs_base + k * stride_lp_k, token_logprob)
```
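In math form, each program instance of the kernel above computes the max-shifted logsumexp over the vocabulary and then the log-probabilities of only the gathered top-k tokens (this is just a summary of the code, written out for review):

$$
\operatorname{logsumexp}\!\left(\tfrac{z}{T}\right) = m + \log \sum_{v=1}^{V} e^{z_v/T - m},
\qquad m = \max_v \tfrac{z_v}{T},
\qquad
\log p^{S}_{k} = \tfrac{z_{t_k}}{T} - \operatorname{logsumexp}\!\left(\tfrac{z}{T}\right),
$$

where $z$ is the student logit row, $T$ is `temperature`, and $t_k$ is the $k$-th teacher-provided token id. Subtracting the running maximum $m$ before exponentiating is what keeps the two vocabulary passes stable in float32.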
```python
@triton.jit
def grad_softmax_kernel(
    grad_student_logits_ptr,
    target_token_ids_ptr,
    teacher_probs_ptr,
    student_probs_ptr,
    mask_ptr,
    B,
    S,
    V,
    K,  # batch size, seq len, vocab size, top-k
    scale,
    stride_gl_b,
    stride_gl_s,
    stride_gl_v,
    stride_t_b,
    stride_t_s,
    stride_t_k,
    stride_p_b,
    stride_p_s,
    stride_p_k,
    stride_sp_b,
    stride_sp_s,
    stride_sp_k,
    stride_m_b,
    stride_m_s,
    stride_m_k,
    BLOCK_SIZE: tl.constexpr,
):
    # Program ID
    pid = tl.program_id(0)
    batch_idx = pid // S
    seq_idx = pid % S

    # Bounds check
    if batch_idx >= B or seq_idx >= S:
        return

    # Base pointers for this (batch, seq) pair
    grad_logits_base = (
        grad_student_logits_ptr + batch_idx * stride_gl_b + seq_idx * stride_gl_s
    )
    token_ids_base = (
        target_token_ids_ptr + batch_idx * stride_t_b + seq_idx * stride_t_s
    )
    teacher_probs_base = (
        teacher_probs_ptr + batch_idx * stride_p_b + seq_idx * stride_p_s
    )
    student_probs_base = (
        student_probs_ptr + batch_idx * stride_sp_b + seq_idx * stride_sp_s
    )
    mask_base = mask_ptr + batch_idx * stride_m_b + seq_idx * stride_m_s

    # Process each teacher probability one at a time, computing all gradients for it
    for k in range(0, K):
        # Load data for current position k
        teacher_prob = tl.load(teacher_probs_base + k * stride_p_k)
        student_prob_k = tl.load(student_probs_base + k * stride_sp_k)
        mask_val = tl.load(mask_base + k * stride_m_k)

        # Precompute the self-influence term (multiplied by scale)
        self_term = teacher_prob * (1.0 - student_prob_k) * scale

        # Calculate gradient contributions for all positions j
        for j in range(0, K):
            token_id_j = tl.load(token_ids_base + j * stride_t_k)
            student_prob_j = tl.load(student_probs_base + j * stride_sp_k)
            mask_j = tl.load(mask_base + j * stride_m_k)

            # Calculate the masking factor
            combined_mask = mask_val * mask_j

            # Determine if this is a diagonal or off-diagonal term
            is_k_equals_j = tl.where(k == j, 1.0, 0.0)

            # Compute the gradient contribution
            # For diagonal (k==j): -teacher_prob * (1-student_prob_k) * scale * mask
            # For off-diagonal: -(-teacher_prob * student_prob_j) * scale * mask
            grad_contribution = (
                -(
                    self_term * is_k_equals_j
                    - teacher_prob * student_prob_j * scale * (1.0 - is_k_equals_j)
                )
                * combined_mask
            )

            # Atomically update the gradient for this token
            tl.atomic_add(
                grad_logits_base + token_id_j * stride_gl_v, grad_contribution
            )
```
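The gradient this kernel scatters into `grad_student_logits` follows from differentiating the forward-KL term with respect to the gathered student logits. As a sanity-check sketch of the unmasked case (the mask merely zeroes pairs of contributions): with teacher probabilities $p^{T}$ and student probabilities $p^{S}$ on the top-k slots,

$$
L=\sum_{k} p^{T}_{k}\left(\log p^{T}_{k}-\log p^{S}_{k}\right),
\qquad
\frac{\partial \log p^{S}_{k}}{\partial z_{j}}=\delta_{kj}-p^{S}_{j}
\;\Longrightarrow\;
\frac{\partial L}{\partial z_{j}}
=-\,p^{T}_{j}\left(1-p^{S}_{j}\right)+\sum_{k\neq j} p^{T}_{k}\,p^{S}_{j}.
$$

The first term is the diagonal `self_term` above and the second is the off-diagonal cross term; both are multiplied by `scale`, which folds in the upstream gradient, the $T^{2}$ factor, the $1/T$ chain-rule factor, and the batch normalization (see `backward` further down).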
```python
@triton.jit
def grad_topk_softmax_kernel(
    grad_student_logits_ptr,
    student_logits_ptr,
    target_token_ids_ptr,
    teacher_probs_ptr,
    student_probs_ptr,
    mask_ptr,
    B,
    S,
    V,
    K,  # batch size, seq len, vocab size, top-k
    scale,
    stride_gl_b,
    stride_gl_s,
    stride_gl_v,
    stride_l_b,
    stride_l_s,
    stride_l_v,
    stride_t_b,
    stride_t_s,
    stride_t_k,
    stride_p_b,
    stride_p_s,
    stride_p_k,
    stride_sp_b,
    stride_sp_s,
    stride_sp_k,
    stride_m_b,
    stride_m_s,
    stride_m_k,
    BLOCK_SIZE: tl.constexpr,
):
    # Program ID
    pid = tl.program_id(0)
    batch_idx = pid // S
    seq_idx = pid % S

    # Bounds check
    if batch_idx >= B or seq_idx >= S:
        return

    # Base pointers for this (batch, seq) pair
    grad_logits_base = (
        grad_student_logits_ptr + batch_idx * stride_gl_b + seq_idx * stride_gl_s
    )
    # logits_base = student_logits_ptr + batch_idx * stride_l_b + seq_idx * stride_l_s
    token_ids_base = (
        target_token_ids_ptr + batch_idx * stride_t_b + seq_idx * stride_t_s
    )
    teacher_probs_base = (
        teacher_probs_ptr + batch_idx * stride_p_b + seq_idx * stride_p_s
    )
    student_probs_base = (
        student_probs_ptr + batch_idx * stride_sp_b + seq_idx * stride_sp_s
    )
    mask_base = mask_ptr + batch_idx * stride_m_b + seq_idx * stride_m_s

    # Load all token IDs, probs and masks for this position
    token_ids = tl.zeros([K], dtype=tl.int32)
    teacher_probs = tl.zeros([K], dtype=tl.float32)
    student_probs = tl.zeros([K], dtype=tl.float32)
    masks = tl.zeros([K], dtype=tl.float32)

    for k in range(K):
        token_ids[k] = tl.load(token_ids_base + k * stride_t_k)
        teacher_probs[k] = tl.load(teacher_probs_base + k * stride_p_k)
        student_probs[k] = tl.load(student_probs_base + k * stride_sp_k)
        masks[k] = tl.load(mask_base + k * stride_m_k)

    # Process gradients for all tokens in this position
    for k in range(K):
        # token_id = token_ids[k]
        mask_k = masks[k]

        # Skip computation if mask is zero by multiplying gradient by mask
        for j in range(K):
            other_token_id = token_ids[j]
            mask_j = masks[j]
            combined_mask = mask_k * mask_j

            # Compute gradient differently for diagonal vs off-diagonal entries
            # Using * 1.0 to convert boolean to float
            is_diagonal = tl.where(j == k, 1.0, 0.0)

            # Self influence: gradient = teacher_prob * (1 - student_prob)
            self_grad = teacher_probs[k] * (1.0 - student_probs[k]) * is_diagonal

            # Cross influence: gradient = -teacher_prob[k] * student_prob[j]
            cross_grad = -teacher_probs[k] * student_probs[j] * (1.0 - is_diagonal)

            # Combined gradient scaled by mask
            grad_val = (self_grad + cross_grad) * scale * combined_mask

            tl.atomic_add(grad_logits_base + other_token_id * stride_gl_v, grad_val)


# Triton-accelerated implementation of KL divergence loss for top-k tokens
# Chunking helper functions for handling long sequences
def chunk_tensor(
    tensor: torch.Tensor, max_seq_len: int
) -> Tuple[torch.Tensor, Optional[int]]:
    """Split a tensor along sequence dimension if needed."""
    _, seq_len, *__ = tensor.shape

    if seq_len <= max_seq_len:
        return tensor, None

    num_chunks = (seq_len + max_seq_len - 1) // max_seq_len
    chunks = []

    for i in range(num_chunks):
        start_idx = i * max_seq_len
        end_idx = min((i + 1) * max_seq_len, seq_len)
        chunks.append(tensor[:, start_idx:end_idx, ...])

    return chunks, num_chunks


def merge_chunks(chunks: list, original_shape: torch.Size):
    """Merge chunks back into a single tensor with original shape."""
    return torch.cat(chunks, dim=1)
```
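A minimal sketch of how these helpers compose, with hypothetical shapes chosen for illustration. Note the asymmetric return of `chunk_tensor`: when no split is needed, the first element is the original tensor and the count is `None`; otherwise it is a list of chunks plus the chunk count, so callers branch on the second value.

```python
import torch

x = torch.zeros(2, 10_000, 64)  # assumed [batch, seq, top_k] tensor longer than the chunk limit
chunks, num_chunks = chunk_tensor(x, max_seq_len=8192)
if num_chunks is None:
    merged = chunks  # no chunking happened; `chunks` is just the original tensor
else:
    merged = merge_chunks(chunks, x.shape)  # concatenates the chunks along the sequence dimension
assert merged.shape == x.shape
```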
```python
# Triton-accelerated implementation of KL divergence loss for top-k tokens
class TopKKLDivergence(torch.autograd.Function):
    """
    Autograd function for KL divergence loss between top-k logprobs
    with support for chunking to handle very long sequences.
    """

    # Max sequence length to process in a single kernel launch
    # This is a tunable parameter that might need adjustment based on GPU memory
    MAX_SEQ_LEN = 8192

    @staticmethod
    def forward(
        ctx,
        student_logits,
        target_token_ids,
        target_logprobs,
        target_mask,
        num_items_in_batch=-1,
        kd_temperature=1.0,
        top_k_before_softmax=0,
    ):
        """
        Forward pass for KL divergence loss between top-k logprobs with chunking.
        """
        # Only convert target_logprobs to float, leave student_logits as is
        target_logprobs = target_logprobs.float()

        # Get dimensions
        batch_size, _, vocab_size = student_logits.shape
        _, teacher_seq_len, top_k = target_token_ids.shape

        # Slice student logits to match teacher sequence length
        student_logits_for_kd = student_logits[:, :teacher_seq_len, :]

        # Store original values for backward pass
        ctx.original_seq_len = teacher_seq_len
        ctx.original_dtype = student_logits.dtype

        # Apply chunking for long sequences
        if teacher_seq_len > TopKKLDivergence.MAX_SEQ_LEN:
            # Chunk the inputs
            student_logits_chunks, num_chunks = chunk_tensor(
                student_logits_for_kd, TopKKLDivergence.MAX_SEQ_LEN
            )
            target_token_ids_chunks, _ = chunk_tensor(
                target_token_ids, TopKKLDivergence.MAX_SEQ_LEN
            )
            # target_logprobs_chunks, _ = chunk_tensor(
            #     target_logprobs, TopKKLDivergence.MAX_SEQ_LEN
            # )
            # target_mask_chunks, _ = chunk_tensor(
            #     target_mask, TopKKLDivergence.MAX_SEQ_LEN
            # )

            # Process each chunk
            student_logprobs_chunks = []
            student_probs_chunks = []

            for i in range(num_chunks):
                chunk_logits = student_logits_chunks[i]
                chunk_token_ids = target_token_ids_chunks[i]
                chunk_seq_len = chunk_logits.shape[1]

                if top_k_before_softmax:
                    # Apply temperature to student logits
                    if kd_temperature != 1.0:
                        chunk_logits = chunk_logits / kd_temperature

                    # Gather student logits for top-k tokens
                    chunk_logits_topk = torch.gather(
                        chunk_logits, dim=-1, index=chunk_token_ids
                    )

                    # Compute softmax over gathered logits
                    chunk_logprobs_topk = torch.log_softmax(chunk_logits_topk, dim=-1)
                    chunk_probs_topk = torch.exp(chunk_logprobs_topk)
                else:
                    # Allocate output tensor for logprobs directly (always in float32)
                    chunk_logprobs_topk = torch.empty(
                        (batch_size, chunk_seq_len, top_k),
                        dtype=torch.float32,
                        device=chunk_logits.device,
                    )

                    # Launch fused kernel directly
                    grid = (batch_size * chunk_seq_len,)
                    fused_logsumexp_logprobs_kernel[grid](
                        chunk_logits.contiguous(),
                        chunk_logprobs_topk,
                        chunk_token_ids.contiguous(),
                        batch_size,
                        chunk_seq_len,
                        vocab_size,
                        top_k,
                        kd_temperature,
                        chunk_logits.stride(0),
                        chunk_logits.stride(1),
                        chunk_logits.stride(2),
                        chunk_logprobs_topk.stride(0),
                        chunk_logprobs_topk.stride(1),
                        chunk_logprobs_topk.stride(2),
                        chunk_token_ids.stride(0),
                        chunk_token_ids.stride(1),
                        chunk_token_ids.stride(2),
                        min(1024, triton.next_power_of_2(vocab_size)),
                    )

                    # Calculate probs from logprobs
                    chunk_probs_topk = torch.exp(chunk_logprobs_topk)

                # Store results
                student_logprobs_chunks.append(chunk_logprobs_topk)
                student_probs_chunks.append(chunk_probs_topk)

            # Merge results
            student_logprobs_topk = torch.cat(student_logprobs_chunks, dim=1)
            student_probs_topk = torch.cat(student_probs_chunks, dim=1)

            # Save chunking info for backward pass
            ctx.used_chunking = True
            ctx.num_chunks = num_chunks

        else:
            # Original code path for shorter sequences
            if top_k_before_softmax:
                # Apply temperature to student logits
                if kd_temperature != 1.0:
                    student_logits_for_kd = student_logits_for_kd / kd_temperature

                # Gather student logits for top-k tokens
                student_logits_topk = torch.gather(
                    student_logits_for_kd, dim=-1, index=target_token_ids
                )

                # Compute softmax over gathered logits
                student_logprobs_topk = torch.log_softmax(student_logits_topk, dim=-1)
                student_probs_topk = torch.exp(student_logprobs_topk)
            else:
                # Allocate output tensor for logprobs directly (always in float32)
                student_logprobs_topk = torch.empty(
                    (batch_size, teacher_seq_len, top_k),
                    dtype=torch.float32,
                    device=student_logits.device,
                )

                # Launch fused kernel directly
                grid = (batch_size * teacher_seq_len,)
                fused_logsumexp_logprobs_kernel[grid](
                    student_logits_for_kd.contiguous(),
                    student_logprobs_topk,
                    target_token_ids.contiguous(),
                    batch_size,
                    teacher_seq_len,
                    vocab_size,
                    top_k,
                    kd_temperature,
                    student_logits_for_kd.stride(0),
                    student_logits_for_kd.stride(1),
                    student_logits_for_kd.stride(2),
                    student_logprobs_topk.stride(0),
                    student_logprobs_topk.stride(1),
                    student_logprobs_topk.stride(2),
                    target_token_ids.stride(0),
                    target_token_ids.stride(1),
                    target_token_ids.stride(2),
                    min(1024, triton.next_power_of_2(vocab_size)),
                )

                # Calculate probs from logprobs
                student_probs_topk = torch.exp(student_logprobs_topk)

            # No chunking used
            ctx.used_chunking = False

        # Save tensors for backward pass
        ctx.save_for_backward(
            student_logits_for_kd,
            target_token_ids,
            target_logprobs,
            target_mask,
            student_probs_topk,
        )
        ctx.kd_temperature = kd_temperature
        ctx.top_k_before_softmax = top_k_before_softmax
        ctx.num_items_in_batch = num_items_in_batch

        # Convert mask to boolean
        valid_mask = target_mask.bool()

        # Extract valid tokens only - this is where the error was happening
        # Use cloned contiguous tensors and explicit indexing for safety
        student_logprobs_flat = student_logprobs_topk.view(-1, top_k)
        target_logprobs_flat = target_logprobs.view(-1, top_k)
        valid_mask_flat = valid_mask.view(-1, top_k)

        # Gather valid indices explicitly to avoid illegal memory access
        valid_indices = torch.nonzero(valid_mask_flat.view(-1)).squeeze(-1)
        student_logprobs_valid = torch.index_select(
            student_logprobs_flat.view(-1), 0, valid_indices
        )
        target_logprobs_valid = torch.index_select(
            target_logprobs_flat.view(-1), 0, valid_indices
        )

        # Convert teacher logprobs to probabilities
        teacher_probs_valid = torch.exp(target_logprobs_valid)

        # Compute KL divergence loss
        token_losses = teacher_probs_valid * (
            target_logprobs_valid - student_logprobs_valid
        )
        kd_loss = token_losses.sum()

        # Apply temperature scaling
        # pylint: disable=duplicate-code
        if kd_temperature != 1.0:
            kd_loss = kd_loss * (kd_temperature**2)

        # Normalize by number of items or valid tokens
        if num_items_in_batch > 0:
            kd_loss = kd_loss / float(num_items_in_batch)
        else:
            num_valid_tokens = valid_indices.numel()
            kd_loss = kd_loss / float(num_valid_tokens if num_valid_tokens > 0 else 1)

        return kd_loss
```
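Putting the pieces of `forward` together, the scalar it returns can be written as follows, with $m$ the top-k mask, $T$ the KD temperature, and $N$ either `num_items_in_batch` or the number of unmasked entries:

$$
\mathcal{L}_{\mathrm{KD}}
=\frac{T^{2}}{N}
\sum_{b,s,k} m_{bsk}\; p^{T}_{bsk}\left(\log p^{T}_{bsk}-\log p^{S}_{bsk}\right).
$$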
```python
    @staticmethod
    def backward(ctx, grad_output):
        """
        Optimized backward pass for KL divergence loss with proper dtype handling and chunking.
        """
        (
            student_logits,
            target_token_ids,
            target_logprobs,
            target_mask,
            student_probs,
        ) = ctx.saved_tensors
        kd_temperature = ctx.kd_temperature
        num_items_in_batch = ctx.num_items_in_batch
        original_dtype = ctx.original_dtype

        # Get dimensions
        batch_size, _, vocab_size = student_logits.shape
        _, teacher_seq_len, top_k = target_token_ids.shape

        # Initialize gradient tensor in float32 to support atomic operations
        grad_student_logits = torch.zeros_like(student_logits, dtype=torch.float32)

        # Compute scaling factor
        scale = grad_output.item()

        # Apply temperature scaling from forward pass
        if kd_temperature != 1.0:
            scale = scale * (kd_temperature**2)

        # Normalize by number of items or valid tokens
        if num_items_in_batch > 0:
            scale = scale / float(num_items_in_batch)
        else:
            scale = scale / float(target_mask.sum().item())

        # Apply chain rule for temperature scaling (1/temperature)
        if kd_temperature != 1.0:
            scale = scale / kd_temperature

        # Convert teacher logprobs to probabilities
        teacher_probs = torch.exp(target_logprobs)

        # Use chunking for the backward pass if used in forward
        if getattr(ctx, "used_chunking", False):
            num_chunks = ctx.num_chunks
            max_seq = TopKKLDivergence.MAX_SEQ_LEN

            # Process each chunk
            for i in range(num_chunks):
                start_idx = i * max_seq
                end_idx = min((i + 1) * max_seq, teacher_seq_len)
                chunk_len = end_idx - start_idx

                # Get chunk slices
                # student_logits_chunk = student_logits[:, start_idx:end_idx, :]
                target_token_ids_chunk = target_token_ids[:, start_idx:end_idx, :]
                teacher_probs_chunk = teacher_probs[:, start_idx:end_idx, :]
                student_probs_chunk = student_probs[:, start_idx:end_idx, :]
                target_mask_chunk = target_mask[:, start_idx:end_idx, :]
                grad_student_logits_chunk = grad_student_logits[:, start_idx:end_idx, :]

                # Launch gradient computation kernel for this chunk
                grid = (batch_size * chunk_len,)
                grad_softmax_kernel[grid](
                    grad_student_logits_chunk.contiguous(),
                    target_token_ids_chunk.contiguous(),
                    teacher_probs_chunk.contiguous(),
                    student_probs_chunk.contiguous(),
                    target_mask_chunk.contiguous(),
                    batch_size,
                    chunk_len,
                    vocab_size,
                    top_k,
                    scale,
                    grad_student_logits_chunk.stride(0),
                    grad_student_logits_chunk.stride(1),
                    grad_student_logits_chunk.stride(2),
                    target_token_ids_chunk.stride(0),
                    target_token_ids_chunk.stride(1),
                    target_token_ids_chunk.stride(2),
                    teacher_probs_chunk.stride(0),
                    teacher_probs_chunk.stride(1),
                    teacher_probs_chunk.stride(2),
                    student_probs_chunk.stride(0),
                    student_probs_chunk.stride(1),
                    student_probs_chunk.stride(2),
                    target_mask_chunk.stride(0),
                    target_mask_chunk.stride(1),
                    target_mask_chunk.stride(2),
                    min(1024, triton.next_power_of_2(top_k)),
                )

                # Update the gradient tensor (already in-place)
        else:
            # Original code path for shorter sequences
            # Launch gradient computation kernel
            grid = (batch_size * teacher_seq_len,)
            grad_softmax_kernel[grid](
                grad_student_logits.contiguous(),
                target_token_ids.contiguous(),
                teacher_probs.contiguous(),
                student_probs.contiguous(),
                target_mask.contiguous(),
                batch_size,
                teacher_seq_len,
                vocab_size,
                top_k,
                scale,
                grad_student_logits.stride(0),
                grad_student_logits.stride(1),
                grad_student_logits.stride(2),
                target_token_ids.stride(0),
                target_token_ids.stride(1),
                target_token_ids.stride(2),
                teacher_probs.stride(0),
                teacher_probs.stride(1),
                teacher_probs.stride(2),
                student_probs.stride(0),
                student_probs.stride(1),
                student_probs.stride(2),
                target_mask.stride(0),
                target_mask.stride(1),
                target_mask.stride(2),
                min(1024, triton.next_power_of_2(top_k)),
            )

        # Convert gradient back to original dtype if needed
        if original_dtype != torch.float32:
            grad_student_logits = grad_student_logits.to(original_dtype)

        # Return gradients for student_logits and None for other inputs
        return grad_student_logits, None, None, None, None, None, None


# Wrapper function for chunked computation
def loss(
    student_logits: torch.Tensor,
    target_token_ids: torch.Tensor,
    target_logprobs: torch.Tensor,
    target_mask: torch.Tensor,
    num_items_in_batch: int = -1,
    kd_temperature: float = 1.0,
    top_k_before_softmax: int = 0,
    max_seq_len: Optional[int] = None,
):
    """
    Triton-accelerated memory-efficient KL divergence loss computation for knowledge distillation
    with support for very long sequences.

    Args:
        student_logits: Student logits [B, seq_len, vocab_size]
        target_token_ids: Teacher token IDs [B, seq_len, top_k]
        target_logprobs: Teacher logprobs [B, seq_len, top_k]
        target_mask: Token mask [B, seq_len, top_k]
        num_items_in_batch: Number of items for normalization (-1 for auto)
        kd_temperature: Temperature for KD
        top_k_before_softmax: Flag for softmax application order
        max_seq_len: Override default MAX_SEQ_LEN value for chunking
    """
    # Allow overriding the max sequence length
    if max_seq_len is not None and max_seq_len > 0:
        TopKKLDivergence.MAX_SEQ_LEN = max_seq_len

    total_loss = TopKKLDivergence.apply(
        student_logits,
        target_token_ids,
        target_logprobs,
        target_mask,
        -1 if num_items_in_batch <= 0 else num_items_in_batch,
        kd_temperature,
        top_k_before_softmax,
    )

    return total_loss
```
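A self-contained call sketch for the wrapper above; the shapes, CUDA device, and temperature here are illustrative assumptions that mirror what the KD trainer and the tests below pass in.

```python
import torch

from axolotl.integrations.kd.topk_logprob.forward_kl_triton import loss as triton_loss

batch, seq_len, vocab, top_k = 2, 128, 32000, 64
student_logits = torch.randn(batch, seq_len, vocab, device="cuda", requires_grad=True)
target_token_ids = torch.randint(0, vocab, (batch, seq_len, top_k), device="cuda")
target_logprobs = torch.log_softmax(
    torch.randn(batch, seq_len, top_k, device="cuda"), dim=-1
)
target_mask = torch.ones(batch, seq_len, top_k, device="cuda")

kd_loss = triton_loss(
    student_logits,
    target_token_ids,
    target_logprobs,
    target_mask,
    num_items_in_batch=batch * seq_len,
    kd_temperature=1.0,
)
kd_loss.backward()  # gradients reach student_logits through the custom autograd.Function
```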
67 src/axolotl/integrations/kd/topk_logprob/logsumexp.py Normal file

@@ -0,0 +1,67 @@
```python
"""
Optimized Triton kernels for logsumexp
"""
# pylint: disable=invalid-name,unused-argument
import triton
import triton.language as tl


# Helper function for computing logsumexp
@triton.jit
def logsumexp_kernel(
    logits_ptr,
    output_ptr,
    B,
    S,
    V,  # batch size, seq len, vocab size
    stride_b,
    stride_s,
    stride_v,
    out_stride_b,
    out_stride_s,
    BLOCK_SIZE: tl.constexpr,
):
    # Program ID
    # pylint: disable=duplicate-code
    pid = tl.program_id(0)
    batch_idx = pid // S
    seq_idx = pid % S

    # Bounds check
    if batch_idx >= B or seq_idx >= S:
        return

    # Pointers
    logits_base = logits_ptr + batch_idx * stride_b + seq_idx * stride_s

    # Find maximum for numerical stability
    max_val = -float("inf")
    for v_offset in range(0, V, BLOCK_SIZE):
        v_size = min(BLOCK_SIZE, V - v_offset)
        mask = tl.arange(0, BLOCK_SIZE) < v_size

        logits_block = tl.load(
            logits_base + (v_offset + tl.arange(0, BLOCK_SIZE)) * stride_v,
            mask=mask,
            other=-float("inf"),
        )
        max_val = tl.maximum(max_val, tl.max(logits_block, axis=0))

    # Compute sum of exp(logit - max_val)
    sum_exp = 0.0
    for v_offset in range(0, V, BLOCK_SIZE):
        v_size = min(BLOCK_SIZE, V - v_offset)
        mask = tl.arange(0, BLOCK_SIZE) < v_size

        logits_block = tl.load(
            logits_base + (v_offset + tl.arange(0, BLOCK_SIZE)) * stride_v,
            mask=mask,
            other=-float("inf"),
        )
        sum_exp += tl.sum(tl.exp(logits_block - max_val), axis=0)

    # Compute logsumexp
    result = max_val + tl.log(sum_exp)

    # Store result
    tl.store(output_ptr + batch_idx * out_stride_b + seq_idx * out_stride_s, result)
```
```diff
@@ -20,6 +20,7 @@ from axolotl.core.trainers.base import AxolotlTrainer
 
 from .topk_logprob.forward_kl import loss as topk_kd_loss
 from .topk_logprob.forward_kl import topk_kd_loss_with_zscore
+from .topk_logprob.forward_kl_triton import loss as topk_kd_loss_triton
 
 
 class AxolotlKDTrainer(AxolotlTrainer):
@@ -85,7 +86,12 @@ class AxolotlKDTrainer(AxolotlTrainer):
                 num_items_in_batch=num_items_in_batch,
             )
         else:
-            loss_kd = topk_kd_loss(
+            loss_fn = (
+                topk_kd_loss
+                if self.args.kd_top_k_before_softmax
+                else topk_kd_loss_triton
+            )
+            loss_kd = loss_fn(
                 shift_logits,
                 target_token_ids_for_loss,
                 target_logprobs_for_loss,
```
```diff
@@ -90,6 +90,12 @@ class TestKnowledgeDistillation:
         check_tensorboard(
             temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
         )
+        check_tensorboard(
+            temp_dir + "/runs", "train/loss", 0.0, "Train Loss is too low", lt=False
+        )
+        check_tensorboard(
+            temp_dir + "/runs", "train/grad_norm", 8.0, "Train grad norm is too high"
+        )
 
     @pytest.mark.parametrize(
         "load_in_8bit",
@@ -121,3 +127,9 @@ class TestKnowledgeDistillation:
         check_tensorboard(
             temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
         )
+        check_tensorboard(
+            temp_dir + "/runs", "train/loss", 0.0, "Train Loss is too low", lt=False
+        )
+        check_tensorboard(
+            temp_dir + "/runs", "train/grad_norm", 8.0, "Train grad norm is too high"
+        )
```
163 tests/e2e/integrations/test_kl_loss.py Normal file

@@ -0,0 +1,163 @@
```python
"""
sanity checks on kl loss and gradients
"""
import torch

# Import both implementations
from axolotl.integrations.kd.topk_logprob.forward_kl import loss as eager_loss
from axolotl.integrations.kd.topk_logprob.forward_kl_triton import loss as triton_loss


def test_kl_loss_gradient():
    """Test that the gradient of the Triton implementation matches the eager implementation."""

    # Set the random seed for reproducibility
    torch.manual_seed(42)

    # Create random inputs
    batch_size = 2
    seq_len = 3
    vocab_size = 100
    top_k = 5

    # Generate random student logits
    student_logits = torch.randn(
        batch_size, seq_len, vocab_size, requires_grad=True, device="cuda"
    )
    student_logits_triton = student_logits.detach().clone().requires_grad_(True)

    # Generate random target token IDs, ensuring they're valid indices
    # pylint: disable=duplicate-code
    target_token_ids = torch.randint(
        0, vocab_size, (batch_size, seq_len, top_k), device="cuda"
    )

    # Generate random target logprobs (before normalization)
    target_logprobs_raw = torch.randn(batch_size, seq_len, top_k, device="cuda")

    # Normalize the target logprobs to ensure they form a valid distribution
    target_logprobs = torch.log_softmax(target_logprobs_raw, dim=-1)

    # Create a random mask with some tokens masked out
    target_mask = torch.randint(
        0, 2, (batch_size, seq_len, top_k), device="cuda"
    ).float()

    # Additional parameters
    num_items_in_batch = batch_size * seq_len
    kd_temperature = 1.0
    top_k_before_softmax = 0  # Test both modes

    # Compute the loss and gradients with eager implementation
    loss_eager = eager_loss(
        student_logits,
        target_token_ids,
        target_logprobs,
        target_mask,
        num_items_in_batch,
        kd_temperature,
        top_k_before_softmax,
    )
    loss_eager.backward()
    grad_eager = student_logits.grad.clone()

    # Reset gradients
    student_logits.grad.zero_()

    # Compute the loss and gradients with Triton implementation
    loss_triton = triton_loss(
        student_logits_triton,
        target_token_ids,
        target_logprobs,
        target_mask,
        num_items_in_batch,
        kd_temperature,
        top_k_before_softmax,
    )
    loss_triton.backward()
    grad_triton = student_logits_triton.grad.clone()

    # Compare loss values
    print(f"Eager loss: {loss_eager.item()}")
    print(f"Triton loss: {loss_triton.item()}")
    loss_diff = abs(loss_eager.item() - loss_triton.item())
    print(f"Loss difference: {loss_diff}")
    assert loss_diff < 1e-5, "Loss values differ significantly!"

    # Compare gradients
    grad_diff = (grad_eager - grad_triton).abs().max().item()
    print(f"Max gradient difference: {grad_diff}")

    # Print some sample gradients
    sample_idx = (0, 0, 0)  # (batch, seq, vocab)
    print(f"Sample eager gradient: {grad_eager[sample_idx].item()}")
    print(f"Sample triton gradient: {grad_triton[sample_idx].item()}")

    # Compute relative difference for non-zero gradients
    mask = grad_eager.abs() > 1e-10
    if mask.sum() > 0:
        rel_diff = (
            (
                (grad_eager[mask] - grad_triton[mask]).abs()
                / (grad_eager[mask].abs() + 1e-10)
            )
            .max()
            .item()
        )
        print(f"Max relative gradient difference: {rel_diff}")
        assert rel_diff < 1e-3, "Gradients differ significantly!"

    # Also test top_k_before_softmax = 1 mode
    top_k_before_softmax = 1

    # Reset the gradients
    student_logits = torch.randn(
        batch_size, seq_len, vocab_size, requires_grad=True, device="cuda"
    )
    student_logits_triton = student_logits.detach().clone().requires_grad_(True)

    # Compute the loss and gradients with eager implementation
    loss_eager = eager_loss(
        student_logits,
        target_token_ids,
        target_logprobs,
        target_mask,
        num_items_in_batch,
        kd_temperature,
        top_k_before_softmax,
    )
    loss_eager.backward()
    grad_eager = student_logits.grad.clone()

    # Compute the loss and gradients with Triton implementation
    loss_triton = triton_loss(
        student_logits_triton,
        target_token_ids,
        target_logprobs,
        target_mask,
        num_items_in_batch,
        kd_temperature,
        top_k_before_softmax,
    )
    loss_triton.backward()
    grad_triton = student_logits_triton.grad.clone()

    # Compare gradients for top_k_before_softmax = 1
    grad_diff = (grad_eager - grad_triton).abs().max().item()
    print("\nWith top_k_before_softmax=1:")
    print(f"Max gradient difference: {grad_diff}")

    # Compute relative difference for non-zero gradients
    mask = grad_eager.abs() > 1e-10
    if mask.sum() > 0:
        rel_diff = (
            (
                (grad_eager[mask] - grad_triton[mask]).abs()
                / (grad_eager[mask].abs() + 1e-10)
            )
            .max()
            .item()
        )
        assert (
            rel_diff < 1e-3
        ), f"Gradients differ significantly, Max relative gradient difference: {rel_diff}"
```
204 tests/e2e/integrations/test_logsumexp.py Normal file

@@ -0,0 +1,204 @@
```python
"""
sanity checks on logsumexp kernel validity
"""
import torch
import triton

from axolotl.integrations.kd.topk_logprob.logsumexp import logsumexp_kernel


# PyTorch implementation of logsumexp for reference
def torch_logsumexp(logits):
    """PyTorch implementation of logsumexp over last dimension"""
    return torch.logsumexp(logits, dim=-1)


# Wrapper function for Triton logsumexp kernel
def triton_logsumexp(logits):
    """Triton implementation of logsumexp over last dimension"""
    B, S, V = logits.shape  # pylint: disable=invalid-name
    output = torch.empty((B, S), dtype=torch.float32, device=logits.device)

    grid = (B * S,)
    logsumexp_kernel[grid](
        logits.contiguous(),
        output,
        B,
        S,
        V,
        logits.stride(0),
        logits.stride(1),
        logits.stride(2),
        output.stride(0),
        output.stride(1),
        min(1024, triton.next_power_of_2(V)),
    )

    return output


class TritonLogSumExp(torch.autograd.Function):
    """
    Wrap a custom autograd function to use the Triton logsumexp for gradient testing
    """

    @staticmethod
    def forward(ctx, logits):
        B, S, V = logits.shape  # pylint: disable=invalid-name
        output = torch.empty((B, S), dtype=torch.float32, device=logits.device)

        # Save inputs for backward pass
        ctx.save_for_backward(logits)
        ctx.shape = logits.shape

        grid = (B * S,)
        logsumexp_kernel[grid](
            logits.contiguous(),
            output,
            B,
            S,
            V,
            logits.stride(0),
            logits.stride(1),
            logits.stride(2),
            output.stride(0),
            output.stride(1),
            min(1024, triton.next_power_of_2(V)),
        )

        return output

    @staticmethod
    def backward(ctx, grad_output):
        (logits,) = ctx.saved_tensors

        # For logsumexp, the gradient is softmax(input) * grad_output
        # First compute the logsumexp
        lse = TritonLogSumExp.apply(logits)

        # Compute softmax by exponentiating differences
        softmax_output = torch.exp(logits - lse.unsqueeze(-1))

        # Compute gradient of logsumexp by multiplying the softmax output by the gradient
        grad_input = softmax_output * grad_output.unsqueeze(-1)

        return grad_input


def test_logsumexp_values():
    """Test that the Triton logsumexp implementation matches PyTorch's"""
    # Set random seed for reproducibility
    torch.manual_seed(42)

    # Test with various input shapes
    test_shapes = [
        (2, 3, 10),  # small vocab
        (4, 5, 100),  # medium vocab
        (2, 2, 32000),  # large vocab (typical for LLMs)
    ]

    for shape in test_shapes:
        # Create random input tensors
        logits = torch.randn(shape, device="cuda", requires_grad=False)

        # Compute logsumexp using both implementations
        torch_result = torch_logsumexp(logits)
        triton_result = triton_logsumexp(logits)

        # Compare results
        max_diff = (torch_result - triton_result).abs().max().item()
        print(f"Shape {shape}, Max diff: {max_diff}")

        # Assert that the results are very close
        assert max_diff < 1e-5, f"Results differ for shape {shape}: max diff {max_diff}"


def test_logsumexp_edge_cases():
    """Test edge cases for numerical stability"""
    # Set random seed for reproducibility
    torch.manual_seed(42)

    # Case 1: Very large values that might cause overflow
    logits_large = torch.ones(2, 3, 100, device="cuda") * 1000

    # Case 2: Very small values that might cause underflow
    logits_small = torch.ones(2, 3, 100, device="cuda") * -1000

    # Case 3: Mix of large and small values
    logits_mixed = torch.zeros(2, 3, 100, device="cuda")
    logits_mixed[:, :, 0] = 1000  # One very large value

    # Case 4: All identical values
    logits_identical = torch.ones(2, 3, 100, device="cuda") * 5

    # Case 5: Extreme values with NaN check
    logits_extreme = torch.cat(
        [
            torch.full((1, 3, 50), 1e10, device="cuda"),
            torch.full((1, 3, 50), -1e10, device="cuda"),
        ],
        dim=0,
    )

    for i, logits in enumerate(
        [logits_large, logits_small, logits_mixed, logits_identical, logits_extreme]
    ):
        # Compute logsumexp using both implementations
        torch_result = torch_logsumexp(logits)
        triton_result = triton_logsumexp(logits)

        # Check for NaNs
        assert not torch.isnan(
            torch_result
        ).any(), f"PyTorch produced NaNs for case {i+1}"
        assert not torch.isnan(
            triton_result
        ).any(), f"Triton produced NaNs for case {i+1}"

        # Compare results
        max_diff = (torch_result - triton_result).abs().max().item()
        print(f"Edge case {i+1}, Max diff: {max_diff}")

        # For very extreme values, allow a bit more tolerance
        if i == 4:  # extreme case
            assert max_diff < 1e-2, f"Results differ too much for edge case {i+1}"
        else:
            assert max_diff < 1e-5, f"Results differ too much for edge case {i+1}"


def test_logsumexp_gradients():
    """Test that the gradients of Triton logsumexp match PyTorch's"""
    # Set random seed for reproducibility
    torch.manual_seed(42)

    # Create input tensors with gradients enabled
    shapes = [(2, 3, 10), (4, 5, 100)]

    for shape in shapes:
        # Create two identical tensors for PyTorch and Triton
        logits_torch = torch.randn(shape, device="cuda", requires_grad=True)
        logits_triton = logits_torch.clone().detach().requires_grad_(True)

        # Forward pass
        torch_output = torch_logsumexp(logits_torch)
        triton_output = TritonLogSumExp.apply(logits_triton)

        # Compare forward pass values
        max_diff_forward = (torch_output - triton_output).abs().max().item()
        assert max_diff_forward < 1e-5, f"Forward pass values differ for shape {shape}"

        # Create random gradient
        grad_output = torch.randn_like(torch_output)

        # Backward pass
        torch_output.backward(grad_output)
        triton_output.backward(grad_output)

        # Compare gradients
        max_diff_grad = (logits_torch.grad - logits_triton.grad).abs().max().item()
        print(f"Shape {shape}, Max gradient diff: {max_diff_grad}")

        # Assert that gradients are very close
        assert (
            max_diff_grad < 1e-5
        ), f"Gradients differ for shape {shape}: max diff {max_diff_grad}"
```
```diff
@@ -102,7 +102,11 @@ def is_hopper():
 
 
 def check_tensorboard(
-    temp_run_dir: str, tag: str, lt_val: float, assertion_err: str
+    temp_run_dir: str,
+    tag: str,
+    comparison_val: float,
+    assertion_err: str,
+    lt: bool = True,
 ) -> None:
     """
     helper function to parse and check tensorboard logs
@@ -112,10 +116,20 @@ def check_tensorboard(
     reader = SummaryReader(event_file)
     df = reader.scalars  # pylint: disable=invalid-name
     df = df[(df.tag == tag)]  # pylint: disable=invalid-name
-    if "%s" in assertion_err:
-        assert df.value.values[-1] < lt_val, assertion_err % df.value.values[-1]
-    else:
-        assert df.value.values[-1] < lt_val, assertion_err
+    if lt:
+        if "%s" in assertion_err:
+            assert df.value.values[-1] < comparison_val, (
+                assertion_err % df.value.values[-1]
+            )
+        else:
+            assert df.value.values[-1] < comparison_val, assertion_err
+    else:
+        if "%s" in assertion_err:
+            assert df.value.values[-1] > comparison_val, (
+                assertion_err % df.value.values[-1]
+            )
+        else:
+            assert df.value.values[-1] > comparison_val, assertion_err
 
 
 def check_model_output_exists(temp_dir: str, cfg: DictDefault) -> None:
```