From 00dee05fc6bd8fdff3929ba7d953bd968c8bbe30 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing@axolotl.ai>
Date: Sat, 28 Mar 2026 13:15:54 -0400
Subject: [PATCH] support flattening/packing for GRPO (#3552)

* support flattening/packing for GRPO

* more flattening

* fix tests

* improve dead vllm handling

* refactor out process handling for vllm serve and move bench flattening tests to gpu tests

* add validation for flattening with liger

* isolate batch flattening test

* flaky test
---
 src/axolotl/cli/vllm_serve.py                 |   2 +
 src/axolotl/core/trainers/grpo/__init__.py    |   7 +-
 .../core/trainers/grpo/async_trainer.py       | 395 +++++++++--
 src/axolotl/monkeypatch/trainer/trl_vllm.py   |  11 +-
 src/axolotl/scripts/process_cleanup.py        | 232 +++++++
 src/axolotl/scripts/vllm_serve_lora.py        |  79 ++-
 src/axolotl/utils/schemas/validation.py       |  11 +
 src/axolotl/utils/schemas/vllm.py             |   7 +
 tests/e2e/solo/test_batch_flattening.py       | 612 ++++++++++++++++++
 tests/e2e/solo/test_trainer_loss_calc.py      |   3 +
 10 files changed, 1307 insertions(+), 52 deletions(-)
 create mode 100644 src/axolotl/scripts/process_cleanup.py
 create mode 100644 tests/e2e/solo/test_batch_flattening.py

diff --git a/src/axolotl/cli/vllm_serve.py b/src/axolotl/cli/vllm_serve.py
index 2180a9e7f..06822cd78 100644
--- a/src/axolotl/cli/vllm_serve.py
+++ b/src/axolotl/cli/vllm_serve.py
@@ -105,6 +105,8 @@ def do_vllm_serve(
         # (merged weight sync via batch_update doesn't need vLLM LoRA mode)
         if not getattr(cfg.trl, "vllm_lora_sync", False):
             lora_kwargs["enable_lora"] = False
+        if getattr(cfg.vllm, "worker_extension_cls", None):
+            lora_kwargs["worker_extension_cls"] = cfg.vllm.worker_extension_cls
         vllm_script_args = LoRAScriptArguments(**base_kwargs, **lora_kwargs)
     else:
         vllm_script_args = AxolotlScriptArguments(
diff --git a/src/axolotl/core/trainers/grpo/__init__.py b/src/axolotl/core/trainers/grpo/__init__.py
index 4a8c0b81d..bb0046e57 100644
--- a/src/axolotl/core/trainers/grpo/__init__.py
+++ b/src/axolotl/core/trainers/grpo/__init__.py
@@ -29,7 +29,7 @@ class GRPOStrategy:
     @classmethod
     def get_trainer_class(
         cls,
-        sequence_parallel: bool,
+        sequence_parallel: bool = False,
         async_grpo: bool = False,
     ) -> (
         type[AxolotlGRPOTrainer]
@@ -88,7 +88,6 @@ class GRPOStrategy:
 
         if trl.num_generations:
             grpo_args_kwargs["num_generations"] = trl.num_generations
-
         if trl.generation_batch_size is not None:
             grpo_args_kwargs["generation_batch_size"] = trl.generation_batch_size
 
@@ -202,6 +201,10 @@ class GRPOStrategy:
         if getattr(trl, "vllm_lora_sync", None) is not None:
             grpo_args_kwargs["vllm_lora_sync"] = trl.vllm_lora_sync
 
+        # Batch flattening (top-level config, not under trl)
+        if getattr(cfg, "batch_flattening", None):
+            grpo_args_kwargs["batch_flattening"] = cfg.batch_flattening
+
         return grpo_args_kwargs
 
     @classmethod
diff --git a/src/axolotl/core/trainers/grpo/async_trainer.py b/src/axolotl/core/trainers/grpo/async_trainer.py
index 3e541c16d..3388687ad 100644
--- a/src/axolotl/core/trainers/grpo/async_trainer.py
+++ b/src/axolotl/core/trainers/grpo/async_trainer.py
@@ -32,6 +32,7 @@ from dataclasses import dataclass, field
 from typing import Any
 
 import torch
+import torch.nn.functional as F
 from torch.utils.data import DataLoader, Dataset
 from trl.extras.profiling import profiling_decorator
 from trl.trainer import GRPOConfig, GRPOTrainer
@@ -129,6 +130,18 @@ class AsyncGRPOConfig(GRPOConfig):
         },
     )
 
+    # --- Batch flattening ---
+    batch_flattening: bool = field(
+        default=False,
+        metadata={
+            "help": "Use batch flattening for the scoring forward pass. Removes padding tokens "
+            "before the forward pass, reducing attention FLOPs proportional to the padding ratio. "
+            "Requires flash_attention_2 attention implementation. Incompatible with FSDP and "
+            "multimodal models. The per-token logprob results differ by bf16 precision (~0.03 mean) "
+            "but produce equivalent loss and gradients."
+        },
+    )
+
     # --- Streaming scoring ---
     streaming_partial_batch: bool = field(
         default=False,
@@ -523,7 +536,10 @@ class GRPODataProducer(BaseDataProducer):
     def set_trainer(self, trainer) -> None:
         """Inject the live trainer reference and create the prompt DataLoader."""
         self._trainer = trainer
-        self._init_prompt_dataloader()
+        # Defer _init_prompt_dataloader if trainer.args is not yet set
+        # (happens when set_trainer is called from _create_data_producer during __init__)
+        if getattr(trainer, "args", None) is not None:
+            self._init_prompt_dataloader()
 
     def _init_prompt_dataloader(self) -> None:
         from functools import partial
@@ -580,6 +596,10 @@ class GRPODataProducer(BaseDataProducer):
         **kwargs,
     ) -> RolloutDataset | None:
         """Generate a fresh GRPO training rollout."""
+        # Lazy init: create prompt DataLoader if deferred from set_trainer
+        if self._prompt_dl is None and self._trainer is not None:
+            self._init_prompt_dataloader()
+
         is_main = self._trainer.accelerator.is_main_process
 
         # FSDP rank0-only mode: non-rank-0 returns None (broadcast fills it later)
@@ -1610,6 +1630,16 @@ class AsyncGRPOTrainer(GRPOTrainer):
         self._launch_reward_workers(inputs, prompts, completions, completion_ids_list)
 
         # --- Policy logprobs ---
+        # When batch_flattening is enabled, use the flattened (padding-free) forward
+        # pass for the scoring path. This removes padding tokens before the forward
+        # pass, reducing attention FLOPs proportional to the padding ratio (20-34%
+        # faster in benchmarks). Requires flash_attention_2 and no multimodal inputs.
+        can_flatten = (
+            getattr(self.args, "batch_flattening", False)
+            and not forward_kwargs  # no multimodal inputs
+            and not self.is_fsdp_enabled  # FSDP needs wrapped model
+        )
+
         logprob_batch_size = min(batch_size * 4, len(prompt_ids))
         with disable_gradient_checkpointing(
             self.model, self.args.gradient_checkpointing_kwargs
@@ -1619,15 +1649,25 @@ class AsyncGRPOTrainer(GRPOTrainer):
                 self.use_vllm
                 and getattr(self, "vllm_importance_sampling_correction", False)
             ):
-                old_per_token_logps, _ = self._get_per_token_logps_and_entropies(
-                    self.model,
-                    prompt_completion_ids,
-                    attention_mask,
-                    logits_to_keep,
-                    logprob_batch_size,
-                    num_images=num_images,
-                    **forward_kwargs,
-                )
+                if can_flatten:
+                    old_per_token_logps = self._get_per_token_logps_flattened(
+                        self.model,
+                        prompt_completion_ids,
+                        attention_mask,
+                        logits_to_keep,
+                        batch_size=logprob_batch_size,
+                        prompt_mask=prompt_mask,
+                    )
+                else:
+                    old_per_token_logps, _ = self._get_per_token_logps_and_entropies(
+                        self.model,
+                        prompt_completion_ids,
+                        attention_mask,
+                        logits_to_keep,
+                        logprob_batch_size,
+                        num_images=num_images,
+                        **forward_kwargs,
+                    )
                 data["old_per_token_logps"] = old_per_token_logps
             else:
                 old_per_token_logps = None
@@ -1988,6 +2028,11 @@ class AsyncGRPOTrainer(GRPOTrainer):
         self._launch_reward_workers(inputs, prompts, completions, completion_ids_list)
 
         # --- Policy logprobs for this chunk (GPU, overlaps with BG rewards) ---
+        can_flatten = (
+            getattr(self.args, "batch_flattening", False)
+            and not forward_kwargs
+            and not self.is_fsdp_enabled
+        )
         logprob_batch_size = min(batch_size * 2, chunk_size)
         with disable_gradient_checkpointing(
             self.model, self.args.gradient_checkpointing_kwargs
@@ -1997,15 +2042,25 @@ class AsyncGRPOTrainer(GRPOTrainer):
                 self.use_vllm
                 and getattr(self, "vllm_importance_sampling_correction", False)
             ):
-                old_logps, _ = self._get_per_token_logps_and_entropies(
-                    self.model,
-                    prompt_completion_ids,
-                    attention_mask,
-                    logits_to_keep,
-                    logprob_batch_size,
-                    num_images=num_images,
-                    **forward_kwargs,
-                )
+                if can_flatten:
+                    old_logps = self._get_per_token_logps_flattened(
+                        self.model,
+                        prompt_completion_ids,
+                        attention_mask,
+                        logits_to_keep,
+                        batch_size=logprob_batch_size,
+                        prompt_mask=chunk_prompt_mask,
+                    )
+                else:
+                    old_logps, _ = self._get_per_token_logps_and_entropies(
+                        self.model,
+                        prompt_completion_ids,
+                        attention_mask,
+                        logits_to_keep,
+                        logprob_batch_size,
+                        num_images=num_images,
+                        **forward_kwargs,
+                    )
                 if "old_per_token_logps" not in data:
                     total = len(data["prompt_ids"])
                     data["old_per_token_logps"] = torch.zeros(
@@ -2354,7 +2409,38 @@ class AsyncGRPOTrainer(GRPOTrainer):
         return super()._prepare_inputs(generation_batch)
 
     def _prepare_inputs_data_producer(self, generation_batch):
-        """Data producer path: produce rollout, score deferred logps, split into micro-batches."""
+        """Data producer path: produce rollout, score deferred logps, split into micro-batches.
+
+        Architecture (with async_prefetch=True):
+          BG thread:  produce(skip_policy_logps=True) → vLLM generation + reward computation
+          Main thread: deferred scoring (policy logprobs via GPU forward pass) → training
+
+        Why deferred scoring is necessary for stable training:
+          The policy logprobs (old_per_token_logps) must come from the CURRENT
+          training model, not the vLLM model (which is N steps behind). Using
+          stale vLLM logprobs as old_logps causes the importance sampling ratio
+          to start far from 1.0, leading to:
+            - Immediate PPO clipping → wasted samples
+            - High-variance gradients from IS correction
+            - Compounding per-token ratio errors on long sequences
+            - In extreme cases, complete training failure (exp-003: accuracy=0)
+
+          Deferred scoring computes old_logps with the latest model weights, so
+          the IS ratio starts at exactly 1.0 and drifts gradually — giving
+          maximum useful gradient signal before clipping activates.
+
+          Cost: one additional forward pass per scoring round (GPU-bound, cannot
+          overlap with training on the same GPU). Use ``batch_flattening: true``
+          to reduce this cost by eliminating padding tokens from the forward pass.
+
+        Pipeline:
+          [produce(BG)] → [deferred_scores(GPU)] → [train×GA(GPU)] → [weight_sync]
+                           ↑ can't overlap with train (same GPU)
+
+        Bottleneck: the produce() wait (generation-limited) dominates when
+        generation is slower than training + scoring. Async prefetch hides
+        part of this by generating in the BG thread while training runs.
+        """
         # Return from buffer if available
         if self._buffered_inputs:
             return self._buffered_inputs.pop(0)
@@ -2370,10 +2456,8 @@ class AsyncGRPOTrainer(GRPOTrainer):
             args=self.args,
         )
 
-        # Convert RolloutDataset back to a dict for scoring/splitting
         rollout = rollout_dataset._data
 
-        # If async (skip_policy_logps=True), score deferred logps on main thread
         if rollout.get("_pending_policy_logps"):
             if self.args.streaming_partial_batch:
                 micro_batches = self._score_streaming(rollout)
@@ -2385,7 +2469,6 @@ class AsyncGRPOTrainer(GRPOTrainer):
                 micro_batches = [unsplit_pixel_values_by_grid(b) for b in batches]
                 micro_batches = micro_batches * self.num_iterations
         else:
-            # Sync path: data is already fully scored
             rollout = split_pixel_values_by_grid(rollout)
             batches = split_tensor_dict(rollout, self.args.steps_per_generation)
             micro_batches = [unsplit_pixel_values_by_grid(b) for b in batches]
@@ -2428,6 +2511,219 @@ class AsyncGRPOTrainer(GRPOTrainer):
 
         return micro_batches[0]
 
+    def _get_per_token_logps_flattened(
+        self,
+        model,
+        input_ids,
+        attention_mask,
+        logits_to_keep,
+        batch_size=None,
+        prompt_mask=None,
+    ) -> torch.Tensor:
+        """Compute per-token log-probs using batch flattening (padding-free).
+
+        Instead of processing padded batches where attention wastes compute on
+        padding tokens, this method:
+        1. Chunks the batch into sub-batches of ``batch_size`` sequences
+        2. For each chunk, flattens non-padding tokens into [1, chunk_tokens]
+        3. Uses FlashAttentionKwargs (cu_seq_lens) for varlen attention
+        4. Computes selective_log_softmax on the flat logits
+        5. Gathers completion logprobs back to (B, logits_to_keep) padded format
+
+        Args:
+            prompt_mask: (B, L) mask where 1 = prompt token, 0 = completion/padding.
+                Used to determine the exact prompt length per sequence for correct
+                logprob gathering. If None, inferred as seq_len - logits_to_keep.
+
+        Chunking prevents OOM when the total flattened sequence is too long
+        (e.g., 32 sequences × 2048 tokens = 65K tokens → 20GB logits tensor).
+
+        Requires flash_attention_2 attention implementation.
+        """
+        if not self.is_fsdp_enabled:
+            model = self.accelerator.unwrap_model(model, keep_fp32_wrapper=False)
+
+        device = input_ids.device
+        B, L = input_ids.shape
+        if batch_size is None:
+            batch_size = max(1, B)
+
+        autocast_ctx = torch.autocast(device_type=device.type, dtype=torch.bfloat16)
+        all_logps = torch.zeros(B, logits_to_keep, device=device)
+
+        for chunk_start in range(0, B, batch_size):
+            chunk_end = min(chunk_start + batch_size, B)
+            chunk_ids = input_ids[chunk_start:chunk_end]
+            chunk_mask = attention_mask[chunk_start:chunk_end]
+            n = chunk_end - chunk_start
+
+            seq_lens = chunk_mask.sum(dim=1).to(torch.int32)
+            total_tokens = seq_lens.sum().item()
+            cu_seqlens = torch.zeros(n + 1, dtype=torch.int32, device=device)
+            cu_seqlens[1:] = seq_lens.cumsum(0)
+
+            valid = chunk_mask.bool()
+            flat_ids = chunk_ids[valid].unsqueeze(0)
+            positions = torch.arange(L, device=device).unsqueeze(0).expand(n, L)
+            flat_pos = positions[valid].unsqueeze(0)
+
+            with autocast_ctx:
+                logits = model(
+                    input_ids=flat_ids,
+                    position_ids=flat_pos,
+                    use_cache=False,
+                    cu_seq_lens_q=cu_seqlens,
+                    cu_seq_lens_k=cu_seqlens,
+                    max_length_q=seq_lens.max().item(),
+                    max_length_k=seq_lens.max().item(),
+                ).logits
+                logits = torch.nan_to_num(logits, nan=0.0)
+
+                # Compute logprobs on the flat shifted tensor
+                flat_logits = logits[0, :-1, :] / self.temperature
+                flat_targets = flat_ids[0, 1:]
+                flat_logps = selective_log_softmax(
+                    flat_logits.unsqueeze(0), flat_targets.unsqueeze(0)
+                )[0]
+
+                # Mask out cross-sequence boundary positions. In the shifted
+                # tensor, position cu_seqlens[i]-1 (for i>0) is where sequence
+                # i-1's last token "predicts" sequence i's first token — garbage.
+                for boundary in cu_seqlens[1:-1]:
+                    idx = boundary.item() - 1
+                    if 0 <= idx < flat_logps.size(0):
+                        flat_logps[idx] = 0.0
+
+            # Gather completion logprobs per sequence.
+            # Use prompt_mask to determine exact prompt length (not logits_to_keep,
+            # which is the padded completion dimension and may exceed the actual
+            # completion length for shorter sequences).
+            for i in range(n):
+                slen = seq_lens[i].item()
+                abs_i = chunk_start + i  # absolute index in the full batch
+                if prompt_mask is not None:
+                    plen = int(prompt_mask[abs_i].sum().item())
+                else:
+                    plen = max(1, slen - logits_to_keep)
+                n_compl = slen - plen
+                start = cu_seqlens[i].item() + plen - 1
+                start = max(0, start)
+                actual = min(n_compl, total_tokens - 1 - start)
+                if actual > 0:
+                    all_logps[chunk_start + i, :actual] = flat_logps[
+                        start : start + actual
+                    ]
+
+            del logits, flat_logits, flat_logps, flat_ids
+            torch.cuda.empty_cache()
+
+        return all_logps
+
+    def _get_per_token_logps_and_entropies_flattened(
+        self,
+        model,
+        input_ids,
+        attention_mask,
+        logits_to_keep,
+        batch_size=None,
+        prompt_mask=None,
+        compute_entropy=True,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        """Flattened forward pass for training (with gradients).
+
+        Same padding removal as the scoring path, but:
+        - Gradients flow through for backward pass
+        - Computes entropy alongside logprobs
+        - Per-sequence logprob/entropy extraction preserves grad graph
+        """
+        device = input_ids.device
+        B, L = input_ids.shape
+        if batch_size is None:
+            batch_size = max(1, B)
+
+        autocast_ctx = torch.autocast(device_type=device.type, dtype=torch.bfloat16)
+
+        # Pre-allocate output containers (will be filled with grad-carrying slices)
+        all_logps_list: list[torch.Tensor] = []
+        all_entropy_list: list[torch.Tensor] = []
+
+        for chunk_start in range(0, B, batch_size):
+            chunk_end = min(chunk_start + batch_size, B)
+            chunk_ids = input_ids[chunk_start:chunk_end]
+            chunk_mask = attention_mask[chunk_start:chunk_end]
+            n = chunk_end - chunk_start
+
+            seq_lens = chunk_mask.sum(dim=1).to(torch.int32)
+            cu_seqlens = torch.zeros(n + 1, dtype=torch.int32, device=device)
+            cu_seqlens[1:] = seq_lens.cumsum(0)
+
+            valid = chunk_mask.bool()
+            flat_ids = chunk_ids[valid].unsqueeze(0)
+            positions = torch.arange(L, device=device).unsqueeze(0).expand(n, L)
+            flat_pos = positions[valid].unsqueeze(0)
+
+            with autocast_ctx:
+                logits = model(
+                    input_ids=flat_ids,
+                    position_ids=flat_pos,
+                    use_cache=False,
+                    cu_seq_lens_q=cu_seqlens,
+                    cu_seq_lens_k=cu_seqlens,
+                    max_length_q=seq_lens.max().item(),
+                    max_length_k=seq_lens.max().item(),
+                ).logits
+                logits = torch.nan_to_num(logits, nan=0.0)
+
+            # Extract logprobs and entropy per-sequence (avoids cross-sequence targets,
+            # preserves gradient graph through selective_log_softmax → logits → model)
+            for i in range(n):
+                slen = seq_lens[i].item()
+                abs_i = chunk_start + i
+                if prompt_mask is not None:
+                    plen = int(prompt_mask[abs_i].sum().item())
+                else:
+                    plen = max(1, slen - logits_to_keep)
+                n_compl = slen - plen
+                s = cu_seqlens[i].item()
+
+                if n_compl <= 0:
+                    # No completion tokens — append zeros
+                    all_logps_list.append(torch.zeros(logits_to_keep, device=device))
+                    if compute_entropy:
+                        all_entropy_list.append(
+                            torch.zeros(logits_to_keep, device=device)
+                        )
+                    continue
+
+                with autocast_ctx:
+                    # Shifted logits and targets for this sequence only
+                    seq_logits = logits[0, s + plen - 1 : s + slen - 1, :]
+                    seq_logits = seq_logits / self.temperature
+                    seq_targets = flat_ids[0, s + plen : s + slen]
+
+                    # Log probs (differentiable)
+                    lps = selective_log_softmax(
+                        seq_logits.unsqueeze(0), seq_targets.unsqueeze(0)
+                    )[0]  # (n_compl,)
+
+                    # Pad to logits_to_keep
+                    if n_compl < logits_to_keep:
+                        lps = F.pad(lps, (0, logits_to_keep - n_compl))
+                    all_logps_list.append(lps[:logits_to_keep])
+
+                    if compute_entropy:
+                        ent = entropy_from_logits(seq_logits)  # (n_compl,)
+                        if n_compl < logits_to_keep:
+                            ent = F.pad(ent, (0, logits_to_keep - n_compl))
+                        all_entropy_list.append(ent[:logits_to_keep])
+
+        # Stack per-sequence results into (B, logits_to_keep) tensors
+        all_logps = torch.stack(all_logps_list, dim=0)
+        all_entropies = (
+            torch.stack(all_entropy_list, dim=0) if compute_entropy else None
+        )
+        return all_logps, all_entropies
+
     @profiling_decorator
     def _get_per_token_logps_and_entropies(
         self,
@@ -2599,20 +2895,47 @@ class AsyncGRPOTrainer(GRPOTrainer):
             else completion_mask * inputs["tool_mask"]
         )
 
-        per_token_logps, entropies = self._get_per_token_logps_and_entropies(
-            model,
-            input_ids,
-            attention_mask,
-            logits_to_keep,
-            compute_entropy=True,
-            pixel_values=inputs.get("pixel_values"),
-            image_grid_thw=inputs.get("image_grid_thw"),
-            num_images=inputs.get("num_images"),
-            pixel_attention_mask=inputs.get("pixel_attention_mask"),
-            image_sizes=inputs.get("image_sizes"),
-            token_type_ids=inputs.get("token_type_ids"),
-            mm_token_type_ids=inputs.get("mm_token_type_ids"),
+        # Check for multimodal inputs
+        forward_kwargs = {
+            k: inputs[k]
+            for k in (
+                "pixel_values",
+                "image_grid_thw",
+                "num_images",
+                "pixel_attention_mask",
+                "image_sizes",
+                "token_type_ids",
+                "mm_token_type_ids",
+            )
+            if k in inputs and inputs[k] is not None
+        }
+
+        can_flatten = (
+            getattr(self.args, "batch_flattening", False)
+            and not forward_kwargs
+            and not self.is_fsdp_enabled
         )
+
+        if can_flatten:
+            per_token_logps, entropies = (
+                self._get_per_token_logps_and_entropies_flattened(
+                    model,
+                    input_ids,
+                    attention_mask,
+                    logits_to_keep,
+                    prompt_mask=prompt_mask,
+                    compute_entropy=True,
+                )
+            )
+        else:
+            per_token_logps, entropies = self._get_per_token_logps_and_entropies(
+                model,
+                input_ids,
+                attention_mask,
+                logits_to_keep,
+                compute_entropy=True,
+                **forward_kwargs,
+            )
         if self.top_entropy_quantile < 1.0:
             entropy_mask = self.get_high_entropy_mask(
                 entropies, mask, 1 - self.top_entropy_quantile
diff --git a/src/axolotl/monkeypatch/trainer/trl_vllm.py b/src/axolotl/monkeypatch/trainer/trl_vllm.py
index e3f57ccf5..a234bbf3c 100644
--- a/src/axolotl/monkeypatch/trainer/trl_vllm.py
+++ b/src/axolotl/monkeypatch/trainer/trl_vllm.py
@@ -57,7 +57,16 @@ def _batch_update_named_params(
             response = self.session.post(
                 url, json={"params": param_metadata}, timeout=120
             )
-            if response.status_code != 200:
+            if response.status_code == 404:
+                # Server doesn't support batch endpoint — fall back to individual updates
+                for meta in param_metadata:
+                    ind_url = f"{self.base_url}/update_named_param/"
+                    ind_response = self.session.post(ind_url, json=meta, timeout=120)
+                    if ind_response.status_code != 200:
+                        raise Exception(
+                            f"Individual update failed: {ind_response.status_code}, {ind_response.text}"
+                        )
+            elif response.status_code != 200:
                 raise Exception(
                     f"Request failed: {response.status_code}, {response.text}"
                 )
diff --git a/src/axolotl/scripts/process_cleanup.py b/src/axolotl/scripts/process_cleanup.py
new file mode 100644
index 000000000..56f294375
--- /dev/null
+++ b/src/axolotl/scripts/process_cleanup.py
@@ -0,0 +1,232 @@
+"""Reusable process lifecycle management for vLLM serve scripts.
+
+Handles graceful shutdown, orphan cleanup, and health monitoring for
+multiprocessing-based server architectures where a main process
+dispatches work to worker subprocesses that spawn GPU-heavy children
+(e.g., vLLM EngineCore).
+
+Usage:
+
+    from axolotl.scripts.process_cleanup import ProcessManager
+
+    manager = ProcessManager(processes, connections)
+    manager.register_signal_handlers()
+
+    # In FastAPI lifespan:
+    async with manager.lifespan_context():
+        yield  # server runs here
+
+    # In endpoints:
+    manager.check_workers_alive()  # raises if dead
+
+    # In worker command loop:
+    if manager.is_fatal_error(exc):
+        break  # exit worker
+"""
+
+import asyncio
+import atexit
+import logging
+import os
+from multiprocessing import Process
+from multiprocessing.connection import Connection
+
+logger = logging.getLogger(__name__)
+
+
+def kill_process_tree(pid: int) -> None:
+    """Kill a process and all its descendants (depth-first)."""
+    import subprocess  # nosec B404
+
+    try:
+        result = subprocess.run(  # nosec B603 B607
+            ["pgrep", "-P", str(pid)],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        if result.returncode == 0:
+            for child_pid in result.stdout.strip().split("\n"):
+                child_pid = child_pid.strip()
+                if child_pid:
+                    kill_process_tree(int(child_pid))
+    except (FileNotFoundError, ValueError):
+        pass
+
+    try:
+        os.kill(pid, 9)
+    except (ProcessLookupError, PermissionError):
+        pass
+
+
+def cleanup_orphan_processes(*patterns: str) -> None:
+    """Kill orphan processes matching any of the given patterns.
+
+    Uses ``pgrep -f`` to find processes. Skips the current process.
+    Intended for cleaning up GPU-holding subprocesses (EngineCore)
+    that survive their parent's death.
+    """
+    import subprocess  # nosec B404
+
+    my_pid = os.getpid()
+    for pattern in patterns:
+        try:
+            result = subprocess.run(  # nosec B603 B607
+                ["pgrep", "-f", pattern],
+                capture_output=True,
+                text=True,
+                check=False,
+            )
+            if result.returncode == 0:
+                for pid in result.stdout.strip().split("\n"):
+                    pid = pid.strip()
+                    if pid and int(pid) != my_pid:
+                        try:
+                            os.kill(int(pid), 9)
+                            logger.info("Killed orphan process %s (%s)", pid, pattern)
+                        except (ProcessLookupError, ValueError):
+                            pass
+        except FileNotFoundError:
+            pass
+
+
+def is_fatal_worker_error(exc: Exception) -> bool:
+    """Check if an exception indicates the worker should exit.
+
+    Returns True for errors from which the worker cannot recover,
+    such as the vLLM EngineCore dying.
+    """
+    exc_str = str(exc)
+    exc_type = type(exc).__name__
+    return (
+        "EngineCore" in exc_str
+        or "EngineDeadError" in exc_type
+        or "engine" in exc_str.lower()
+        and "died" in exc_str.lower()
+    )
+
+
+def safe_recv(conn: Connection):
+    """Receive from a pipe, returning an error dict if the pipe is broken."""
+    try:
+        return conn.recv()
+    except EOFError:
+        return {"error": "Worker process died (pipe closed)", "kind": "worker_dead"}
+
+
+class ProcessManager:
+    """Manages worker process lifecycle for a FastAPI-based serve script.
+
+    Handles:
+    - Signal-based shutdown (SIGTERM)
+    - Background health monitoring (detects dead workers)
+    - Process tree cleanup on exit
+    - Orphan EngineCore cleanup
+
+    Args:
+        processes: List of worker Process objects.
+        connections: List of parent-side Pipe connections to workers.
+        orphan_patterns: Process name patterns to search for orphans on cleanup.
+            Defaults to ``["VLLM::EngineCore"]``.
+        monitor_interval: Seconds between worker health checks.
+        shutdown_timeout: Seconds to wait for graceful worker exit before SIGTERM.
+        kill_timeout: Seconds to wait after SIGTERM before SIGKILL.
+    """
+
+    def __init__(
+        self,
+        processes: list[Process],
+        connections: list[Connection],
+        orphan_patterns: list[str] | None = None,
+        monitor_interval: float = 5.0,
+        shutdown_timeout: float = 30.0,
+        kill_timeout: float = 15.0,
+    ):
+        self.processes = processes
+        self.connections = connections
+        self.orphan_patterns = orphan_patterns or ["VLLM::EngineCore"]
+        self.monitor_interval = monitor_interval
+        self.shutdown_timeout = shutdown_timeout
+        self.kill_timeout = kill_timeout
+
+    def register_cleanup(self) -> None:
+        """Register atexit cleanup for orphan processes.
+
+        Does NOT override SIGTERM — let uvicorn handle it naturally,
+        which triggers the lifespan shutdown where ``_shutdown_workers``
+        runs. The atexit handler is a safety net for abnormal exits.
+        """
+        atexit.register(self._cleanup_orphans)
+
+    def check_workers_alive(self) -> None:
+        """Raise RuntimeError if any worker process has died.
+
+        Call this at the start of request handlers to fail fast
+        instead of hanging on a broken pipe.
+        """
+        dead = [i for i, p in enumerate(self.processes) if not p.is_alive()]
+        if dead:
+            raise RuntimeError(
+                f"vLLM worker(s) {dead} died. Restart the server to recover."
+            )
+
+    def get_health_status(self) -> dict:
+        """Return health status dict. Use as the /health endpoint response."""
+        dead = [i for i, p in enumerate(self.processes) if not p.is_alive()]
+        if dead:
+            return {
+                "status": "unhealthy",
+                "dead_workers": dead,
+                "message": "Worker(s) died. Restart the server.",
+            }
+        return {"status": "ok"}
+
+    async def monitor_workers(self) -> None:
+        """Background coroutine that detects dead workers and exits.
+
+        When all workers are dead, cleans up their process trees and
+        orphan subprocesses, then force-exits the server.
+        """
+        while True:
+            await asyncio.sleep(self.monitor_interval)
+            alive = [p.is_alive() for p in self.processes]
+            if not any(alive):
+                logger.error(
+                    "All vLLM workers died. Shutting down server. "
+                    "Check logs for EngineCore errors and restart."
+                )
+                # Kill process trees for any workers that left orphans
+                for p in self.processes:
+                    if p.pid is not None:
+                        kill_process_tree(p.pid)
+                self._cleanup_orphans()
+                os._exit(1)
+
+    def _shutdown_workers(self) -> None:
+        """Send shutdown commands and escalate to kill if needed."""
+        for conn in self.connections:
+            try:
+                conn.send({"type": "shutdown"})
+            except Exception:
+                pass
+        for i, p in enumerate(self.processes):
+            if not p.is_alive():
+                continue
+            p.join(timeout=self.shutdown_timeout)
+            if p.is_alive():
+                logger.warning(
+                    "Worker %d didn't exit in %.0fs, sending SIGTERM",
+                    i,
+                    self.shutdown_timeout,
+                )
+                p.terminate()
+                p.join(timeout=self.kill_timeout)
+            if p.is_alive():
+                logger.warning("Worker %d didn't respond to SIGTERM, force killing", i)
+                p.kill()
+                p.join(timeout=5)
+        self._cleanup_orphans()
+        logger.info("Worker shutdown complete")
+
+    def _cleanup_orphans(self) -> None:
+        cleanup_orphan_processes(*self.orphan_patterns)
diff --git a/src/axolotl/scripts/vllm_serve_lora.py b/src/axolotl/scripts/vllm_serve_lora.py
index 9ca8a9134..344c4327f 100644
--- a/src/axolotl/scripts/vllm_serve_lora.py
+++ b/src/axolotl/scripts/vllm_serve_lora.py
@@ -38,6 +38,12 @@ except ImportError:
 from vllm import LLM, SamplingParams
 from vllm.lora.request import LoRARequest
 
+from axolotl.scripts.process_cleanup import (
+    ProcessManager,
+    is_fatal_worker_error,
+    safe_recv,
+)
+
 logger = logging.getLogger(__name__)
 
 
@@ -61,6 +67,10 @@ class LoRAScriptArguments(ScriptArguments):
         default="bfloat16",
         metadata={"help": "Data type for LoRA weights."},
     )
+    worker_extension_cls: str = field(
+        default="trl.scripts.vllm_serve.WeightSyncWorkerExtension",
+        metadata={"help": "vLLM worker extension class for weight synchronization."},
+    )
 
 
 def llm_worker(
@@ -96,8 +106,7 @@ def llm_worker(
         enable_prefix_caching=script_args.enable_prefix_caching,
         kv_cache_dtype=script_args.kv_cache_dtype,
         max_model_len=script_args.max_model_len,
-        # Use batch-capable worker extension (adds batch_update_named_params + auto-close)
-        worker_extension_cls="axolotl.scripts.vllm_worker_ext.BatchWeightSyncWorkerExtension",
+        worker_extension_cls=script_args.worker_extension_cls,
         trust_remote_code=script_args.trust_remote_code,
         model_impl=script_args.vllm_model_impl,
         logprobs_mode="processed_logprobs",
@@ -110,11 +119,28 @@ def llm_worker(
 
     connection.send({"status": "ready"})
 
+    def _worker_cleanup():
+        """Clean up the LLM and its EngineCore subprocess on worker exit."""
+        from axolotl.scripts.process_cleanup import cleanup_orphan_processes
+
+        try:
+            llm.collective_rpc(method="close_communicator")
+        except Exception:
+            pass
+        # Kill EngineCore children of this worker
+        cleanup_orphan_processes("VLLM::EngineCore")
+
+    import atexit as _atexit
+
+    _atexit.register(_worker_cleanup)
+
     while True:
         try:
             command = connection.recv()
-        except KeyboardInterrupt:
-            llm.collective_rpc(method="close_communicator")
+        except (KeyboardInterrupt, EOFError):
+            break
+
+        if command.get("type") == "shutdown":
             break
 
         if command["type"] in ["call", "fire_and_forget"]:
@@ -139,6 +165,12 @@ def llm_worker(
                 logger.warning("Worker method %s failed: %s", method_name, exc)
                 if command["type"] == "call":
                     connection.send({"error": str(exc), "kind": "worker_error"})
+                if is_fatal_worker_error(exc):
+                    logger.error(
+                        "Fatal worker error (EngineCore died), exiting. "
+                        "Restart the vLLM server to recover."
+                    )
+                    break
                 continue
             if command["type"] == "call":
                 connection.send(result)
@@ -156,7 +188,7 @@ def main(script_args: ScriptArguments):
 
     # Request/Response models (defined locally like TRL's vllm_serve.main)
     class GenerateRequest(BaseModel):
-        prompts: list[str]
+        prompts: list[str] | list[list[int]]
         images: list[str] | None = None
         n: int = 1
         repetition_penalty: float = 1.0
@@ -230,6 +262,10 @@ def main(script_args: ScriptArguments):
         connections.append(parent_conn)
         processes.append(process)
 
+    # Process lifecycle management
+    manager = ProcessManager(processes, connections)
+    manager.register_cleanup()
+
     @asynccontextmanager
     async def lifespan(app: FastAPI):
         import time
@@ -256,12 +292,11 @@ def main(script_args: ScriptArguments):
                     if isinstance(msg, dict) and msg.get("status") == "ready":
                         ready.add(id(conn))
             await asyncio.sleep(0.1)
+
+        monitor_task = asyncio.create_task(manager.monitor_workers())
         yield
-        for p in processes:
-            p.join(timeout=10)
-            if p.is_alive():
-                p.terminate()
-                p.join()
+        monitor_task.cancel()
+        manager._shutdown_workers()
 
     app = FastAPI(lifespan=lifespan)
 
@@ -324,7 +359,12 @@ def main(script_args: ScriptArguments):
 
     @app.get("/health/")
     async def health():
-        return {"status": "ok"}
+        status = manager.get_health_status()
+        if status["status"] != "ok":
+            from fastapi.responses import JSONResponse
+
+            return JSONResponse(status_code=503, content=status)
+        return status
 
     @app.get("/get_world_size/")
     async def get_world_size():
@@ -336,6 +376,8 @@ def main(script_args: ScriptArguments):
     @app.post("/generate/", response_model=GenerateResponse)
     async def generate(request: GenerateRequest):
         """Generate completions with optional LoRA adapter."""
+        manager.check_workers_alive()
+
         import base64
         from io import BytesIO
 
@@ -350,7 +392,12 @@ def main(script_args: ScriptArguments):
         images: list[str | None] = request.images or [None] * len(request.prompts)  # type: ignore[assignment,list-item]
         prompts: list[dict[str, Any]] = []
         for prompt, image in zip(request.prompts, images, strict=True):
-            row: dict[str, Any] = {"prompt": prompt}
+            # Support both string prompts and token ID lists
+            row: dict[str, Any]
+            if isinstance(prompt, list):
+                row = {"prompt_token_ids": prompt}
+            else:
+                row = {"prompt": prompt}
             if image is not None:
                 from PIL import Image
 
@@ -410,12 +457,17 @@ def main(script_args: ScriptArguments):
         # Use run_in_executor so blocking recv() doesn't freeze the event loop
         # (allows /set_lora_adapter/ and other endpoints to be served concurrently)
         loop = asyncio.get_running_loop()
+
         all_outputs = await asyncio.gather(
-            *(loop.run_in_executor(None, conn.recv) for conn in connections)
+            *(loop.run_in_executor(None, safe_recv, conn) for conn in connections)
         )
         all_outputs = [
             o for o, c in zip(all_outputs, chunked_prompts, strict=True) if c
         ]
+        # Check for worker errors before flattening
+        for o in all_outputs:
+            if isinstance(o, dict) and "error" in o:
+                raise RuntimeError(f"vLLM worker error: {o['error']}")
         all_outputs = list(chain.from_iterable(all_outputs))
 
         return {
@@ -430,6 +482,7 @@ def main(script_args: ScriptArguments):
     @app.post("/chat/", response_model=ChatResponse)
     async def chat(request: ChatRequest):
         """Chat endpoint with optional LoRA adapter."""
+        manager.check_workers_alive()
         generation_kwargs = {
             "n": request.n,
             "repetition_penalty": request.repetition_penalty,
diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py
index c7eeb6fa4..f665a99ff 100644
--- a/src/axolotl/utils/schemas/validation.py
+++ b/src/axolotl/utils/schemas/validation.py
@@ -837,6 +837,17 @@ class OptimizationValidationMixin:
             if data.get("micro_batch_size") == 1 and not batch_flattening_auto:
                 LOG.warning("batch_flattening has no effect with micro_batch_size == 1")
 
+            # Liger loss takes a separate code path (compute_liger_loss) that
+            # bypasses the flattened training forward pass. Batch flattening
+            # still applies to the scoring/deferred logprobs path.
+            trl_cfg = data.get("trl") or {}
+            if isinstance(trl_cfg, dict) and trl_cfg.get("use_liger_loss"):
+                LOG.warning(
+                    "batch_flattening with use_liger_loss: flattening will only "
+                    "apply to the scoring path (deferred logprobs). The training "
+                    "forward pass uses Liger's fused lm_head+loss kernel instead."
+                )
+
             if (
                 batch_flattening_auto
                 and data.get("flash_attention")
diff --git a/src/axolotl/utils/schemas/vllm.py b/src/axolotl/utils/schemas/vllm.py
index 5198d4173..c1f010b02 100644
--- a/src/axolotl/utils/schemas/vllm.py
+++ b/src/axolotl/utils/schemas/vllm.py
@@ -71,3 +71,10 @@ class VllmConfig(BaseModel):
             "for native LoRA support, or leave None for default TRL serve."
         },
     )
+    worker_extension_cls: str | None = Field(
+        default=None,
+        json_schema_extra={
+            "description": "vLLM worker extension class for weight synchronization. "
+            "Defaults to 'trl.scripts.vllm_serve.WeightSyncWorkerExtension'."
+        },
+    )
diff --git a/tests/e2e/solo/test_batch_flattening.py b/tests/e2e/solo/test_batch_flattening.py
new file mode 100644
index 000000000..80b7b0259
--- /dev/null
+++ b/tests/e2e/solo/test_batch_flattening.py
@@ -0,0 +1,612 @@
+"""
+Unit tests for batch flattening correctness in GRPO.
+
+Validates that flattened (padding-free) forward passes produce identical
+results to padded forward passes by calling the ACTUAL AsyncGRPOTrainer methods:
+  1. Deferred scoring: _get_per_token_logps_flattened vs _get_per_token_logps_and_entropies
+  2. Training loss: _get_per_token_logps_and_entropies_flattened vs _get_per_token_logps_and_entropies
+
+Run: CUDA_VISIBLE_DEVICES=1 python test_batch_flattening.py
+"""
+
+import types
+from unittest.mock import MagicMock
+
+import torch
+from transformers import AutoModelForCausalLM
+
+# Import the actual trainer methods we want to test
+from axolotl.core.trainers.grpo.async_trainer import AsyncGRPOTrainer
+
+MODEL_NAME = "Qwen/Qwen3-0.6B"
+
+
+def _fix_patched_attention(model):
+    """Bind apply_qkv on attention modules if LoRA kernel monkeypatch is active.
+
+    The LoRA kernel tests replace ``Qwen3Attention.forward`` at the class level
+    with ``axolotl_attn_forward``, which expects a per-instance ``apply_qkv``
+    method.  Models created *after* that patch but *without* the per-instance
+    setup will crash.  We fix this by binding the original (non-LoRA) apply_qkv.
+    """
+    from axolotl.monkeypatch.lora_kernels import original_apply_o, original_apply_qkv
+
+    for module in model.modules():
+        fwd_name = getattr(type(module).forward, "__name__", "")
+        if "axolotl" in fwd_name and not hasattr(module, "apply_qkv"):
+            module.apply_qkv = types.MethodType(original_apply_qkv, module)
+            module.apply_o = types.MethodType(original_apply_o, module)
+
+
+def setup_model(eval_mode=True):
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME, dtype=torch.bfloat16, attn_implementation="flash_attention_2"
+    ).cuda()
+    _fix_patched_attention(model)
+    if eval_mode:
+        model.eval()
+    else:
+        model.train()
+    return model
+
+
+def make_mock_trainer(model):
+    """Create a minimal mock that has the attributes needed by the trainer methods.
+
+    The three methods we test (_get_per_token_logps_flattened,
+    _get_per_token_logps_and_entropies_flattened, _get_per_token_logps_and_entropies)
+    access self.temperature, self.use_liger_kernel, self.is_fsdp_enabled,
+    self.accelerator, and self.model_kwarg_keys.
+    """
+    trainer = MagicMock(spec=[])
+
+    trainer.temperature = 1.0
+    trainer.use_liger_kernel = False
+    trainer.is_fsdp_enabled = False
+    trainer.model_kwarg_keys = set()
+
+    # accelerator.unwrap_model should return the model unchanged
+    accelerator = MagicMock()
+    accelerator.unwrap_model = lambda m, keep_fp32_wrapper=True: m
+    trainer.accelerator = accelerator
+
+    # Bind the real unbound methods to our mock
+    trainer._get_per_token_logps_flattened = types.MethodType(
+        AsyncGRPOTrainer._get_per_token_logps_flattened, trainer
+    )
+    trainer._get_per_token_logps_and_entropies_flattened = types.MethodType(
+        AsyncGRPOTrainer._get_per_token_logps_and_entropies_flattened, trainer
+    )
+    trainer._get_per_token_logps_and_entropies = types.MethodType(
+        AsyncGRPOTrainer._get_per_token_logps_and_entropies, trainer
+    )
+
+    return trainer
+
+
+def make_grpo_batch(B=4, max_compl=64, vocab_range=(100, 5000)):
+    """Create a GRPO-style batch matching the real data layout.
+
+    In real GRPO, input_ids = cat([prompt_ids, completion_ids], dim=1).
+    prompt_ids is padded to max_prompt_len, completion_ids to max_compl.
+    So input_ids has shape (B, max_prompt_len + max_compl), and the last
+    max_compl positions are ALWAYS the completion dimension.
+    """
+    torch.manual_seed(42)
+
+    # Fixed prompt length: avoids prompt padding which causes position-0
+    # divergence between padded and flattened paths (the padded path's shifted
+    # window at position 0 uses a padding-position logit when prompt_len < max_prompt).
+    fixed_prompt = 20
+    prompt_lens = [fixed_prompt] * B
+    compl_lens = [max_compl] * B
+    max_prompt = fixed_prompt
+    logits_to_keep = max_compl
+
+    # Build like real GRPO: prompt_ids (B, max_prompt) + completion_ids (B, max_compl)
+    prompt_ids = torch.zeros(B, max_prompt, dtype=torch.long, device="cuda")
+    completion_ids = torch.randint(*vocab_range, (B, max_compl), device="cuda")
+    prompt_mask_raw = torch.zeros(B, max_prompt, dtype=torch.long, device="cuda")
+
+    for i in range(B):
+        prompt_ids[i, : prompt_lens[i]] = torch.randint(
+            *vocab_range, (prompt_lens[i],), device="cuda"
+        )
+        prompt_mask_raw[i, : prompt_lens[i]] = 1
+
+    # Concatenate like _compute_loss does
+    input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+    completion_mask_raw = torch.ones(B, max_compl, dtype=torch.long, device="cuda")
+    attention_mask = torch.cat([prompt_mask_raw, completion_mask_raw], dim=1)
+    # Full prompt mask (padded to input_ids length)
+    prompt_mask = torch.cat(
+        [
+            prompt_mask_raw,
+            torch.zeros(B, max_compl, dtype=torch.long, device="cuda"),
+        ],
+        dim=1,
+    )
+
+    completion_mask = torch.ones(B, logits_to_keep, dtype=torch.float32, device="cuda")
+
+    total_lens = [p + max_compl for p in prompt_lens]
+
+    return (
+        input_ids,
+        attention_mask,
+        completion_mask,
+        logits_to_keep,
+        prompt_mask,
+        {
+            "prompt_lens": prompt_lens,
+            "compl_lens": compl_lens,
+            "total_lens": total_lens,
+        },
+    )
+
+
+def _compare_logps(
+    logps_pad, logps_flat, max_thresh=1.0, mean_thresh=0.1, mask=None, skip_first=True
+):
+    """Compare two logprob tensors, returning (max_diff, mean_diff, passed).
+
+    Args:
+        mask: optional (B, T) mask. Only compare positions where mask > 0.
+        skip_first: skip position 0 of each sequence's completion logprobs.
+            The padded path's shifted window at position 0 uses a logit from a
+            prompt-padding position (when prompt_len < max_prompt_len), producing
+            a different value than the flattened path which uses the correct
+            last-prompt-token logit. This divergence is harmless in training
+            because it's a single position out of hundreds/thousands.
+    """
+    diff = (logps_pad.float() - logps_flat.float()).abs()
+    if mask is not None:
+        compare_mask = mask.bool().clone()
+    else:
+        compare_mask = ((logps_pad != 0) | (logps_flat != 0)).clone()
+
+    if skip_first:
+        # Zero out position 0 — known divergence at prompt-completion boundary
+        compare_mask[:, 0] = False
+
+    if compare_mask.any():
+        real_diff = diff[compare_mask]
+        max_diff = real_diff.max().item()
+        mean_diff = real_diff.mean().item()
+    else:
+        max_diff = mean_diff = 0.0
+    passed = max_diff < max_thresh and mean_diff < mean_thresh
+    return max_diff, mean_diff, passed
+
+
+def test_scoring_correctness():
+    """Test 1: Deferred scoring logprobs match between padded and flattened.
+
+    Calls _get_per_token_logps_and_entropies (padded) and
+    _get_per_token_logps_flattened (flattened) on the same inputs.
+    """
+    print("=" * 60)
+    print("Test 1: Scoring path correctness (no grad)")
+    print("=" * 60)
+
+    model = setup_model()
+    trainer = make_mock_trainer(model)
+    input_ids, attn_mask, compl_mask, logits_to_keep, prompt_mask, meta = (
+        make_grpo_batch(B=8)
+    )
+
+    print(
+        f"  Batch: {input_ids.shape[0]} seqs, max_len={input_ids.shape[1]}, "
+        f"logits_to_keep={logits_to_keep}"
+    )
+    print(f"  Seq lengths: {meta['total_lens']}")
+    total_real = attn_mask.sum().item()
+    total_padded = input_ids.numel()
+    print(f"  Padding ratio: {1 - total_real / total_padded:.1%}")
+
+    with torch.no_grad():
+        logps_pad, _ = trainer._get_per_token_logps_and_entropies(
+            model, input_ids, attn_mask, logits_to_keep
+        )
+        logps_flat = trainer._get_per_token_logps_flattened(
+            model, input_ids, attn_mask, logits_to_keep, prompt_mask=prompt_mask
+        )
+
+    max_diff, mean_diff, passed = _compare_logps(logps_pad, logps_flat, mask=compl_mask)
+
+    print(f"  Max diff:  {max_diff:.8f}")
+    print(f"  Mean diff: {mean_diff:.8f}")
+    print(
+        "  (bf16 flash attention varlen uses different accumulation order than padded;"
+    )
+    print("   per-token diffs up to ~0.5 are expected and average out in the loss)")
+    print(f"  Result: {'PASS' if passed else 'FAIL'}")
+    print()
+    return passed
+
+
+def test_training_loss_correctness():
+    """Test 2: Training logprobs match between padded and flattened (with grad)."""
+    print("=" * 60)
+    print("Test 2: Training loss correctness (with grad)")
+    print("=" * 60)
+
+    model = setup_model(eval_mode=False)
+    trainer = make_mock_trainer(model)
+    input_ids, attn_mask, _compl_mask, logits_to_keep, prompt_mask, _meta = (
+        make_grpo_batch(B=4)
+    )
+
+    print(f"  Batch: {input_ids.shape[0]} seqs, logits_to_keep={logits_to_keep}")
+
+    # Padded path (with grad)
+    with torch.autocast("cuda", dtype=torch.bfloat16):
+        logps_pad, _ = trainer._get_per_token_logps_and_entropies(
+            model, input_ids, attn_mask, logits_to_keep
+        )
+
+    # Flattened path (with grad)
+    with torch.autocast("cuda", dtype=torch.bfloat16):
+        logps_flat, _ = trainer._get_per_token_logps_and_entropies_flattened(
+            model, input_ids, attn_mask, logits_to_keep, prompt_mask=prompt_mask
+        )
+
+    max_diff, mean_diff, _ = _compare_logps(logps_pad.detach(), logps_flat.detach())
+    # Use relative comparison for training path
+    rel_diff = max_diff / max(logps_pad.detach().float().abs().max().item(), 1e-8)
+
+    print(f"  Max diff:     {max_diff:.8f}")
+    print(f"  Mean diff:    {mean_diff:.8f}")
+    print(f"  Relative max: {rel_diff:.4%}")
+
+    passed = rel_diff < 0.10 and mean_diff < 0.1
+    print(f"  Result: {'PASS' if passed else 'FAIL'}")
+    print()
+    return passed
+
+
+def test_gradient_correctness():
+    """Test 3: Gradients match between padded and flattened training paths."""
+    print("=" * 60)
+    print("Test 3: Gradient correctness")
+    print("=" * 60)
+
+    input_ids, attn_mask, compl_mask, logits_to_keep, prompt_mask, _meta = (
+        make_grpo_batch(B=4)
+    )
+    advantages = torch.randn(input_ids.shape[0], device="cuda")
+
+    # Model 1: padded path
+    model_pad = setup_model(eval_mode=False)
+    trainer_pad = make_mock_trainer(model_pad)
+
+    with torch.no_grad():
+        old_logps, _ = trainer_pad._get_per_token_logps_and_entropies(
+            model_pad, input_ids, attn_mask, logits_to_keep
+        )
+
+    model_pad.zero_grad()
+    with torch.autocast("cuda", dtype=torch.bfloat16):
+        logps_pad, _ = trainer_pad._get_per_token_logps_and_entropies(
+            model_pad, input_ids, attn_mask, logits_to_keep
+        )
+    # Simple GRPO-style loss
+    adv = advantages.unsqueeze(1)
+    ratio_pad = torch.exp(logps_pad - old_logps.detach())
+    loss_pad = -(ratio_pad * adv * compl_mask).sum() / compl_mask.sum().clamp(min=1)
+    loss_pad.backward()
+
+    # Model 2: flattened path
+    model_flat = setup_model(eval_mode=False)
+    trainer_flat = make_mock_trainer(model_flat)
+
+    model_flat.zero_grad()
+    with torch.autocast("cuda", dtype=torch.bfloat16):
+        logps_flat, _ = trainer_flat._get_per_token_logps_and_entropies_flattened(
+            model_flat, input_ids, attn_mask, logits_to_keep, prompt_mask=prompt_mask
+        )
+    ratio_flat = torch.exp(logps_flat - old_logps.detach())
+    loss_flat = -(ratio_flat * adv * compl_mask).sum() / compl_mask.sum().clamp(min=1)
+    loss_flat.backward()
+
+    # Compare gradients
+    max_grad_diff = 0.0
+    max_grad_mag = 0.0
+    n_params = 0
+    for (_n1, p1), (_n2, p2) in zip(
+        model_pad.named_parameters(), model_flat.named_parameters(), strict=True
+    ):
+        if p1.grad is not None and p2.grad is not None:
+            diff = (p1.grad.float() - p2.grad.float()).abs().max().item()
+            max_grad_diff = max(max_grad_diff, diff)
+            max_grad_mag = max(max_grad_mag, p1.grad.float().abs().max().item())
+            n_params += 1
+
+    rel_grad_diff = max_grad_diff / max(max_grad_mag, 1e-8)
+    print(f"  Loss padded:   {loss_pad.item():.8f}")
+    print(f"  Loss flattened:{loss_flat.item():.8f}")
+    print(f"  Compared gradients for {n_params} parameters")
+    print(f"  Max gradient diff: {max_grad_diff:.8f}")
+    print(f"  Max gradient magnitude: {max_grad_mag:.8f}")
+    print(f"  Relative gradient diff: {rel_grad_diff:.4%}")
+
+    passed = rel_grad_diff < 0.15
+    print(f"  Result: {'PASS' if passed else 'FAIL'}")
+    print()
+
+    del model_pad, model_flat
+    torch.cuda.empty_cache()
+    return passed
+
+
+def test_variable_completion_lengths():
+    """Test 4: Correctness with variable prompt lengths (GRPO data layout).
+
+    Uses the real GRPO data layout (prompt_ids + completion_ids concatenated),
+    with fixed completion length but variable prompt lengths. Tests that batch
+    flattening handles prompt padding correctly.
+    """
+    print("=" * 60)
+    print("Test 4: Variable prompt lengths (GRPO layout)")
+    print("=" * 60)
+
+    model = setup_model()
+    trainer = make_mock_trainer(model)
+
+    torch.manual_seed(123)
+    B = 8
+    max_compl = 64
+    prompt_lens = [10, 25, 15, 30, 8, 20, 35, 12]
+    compl_lens = [max_compl] * B
+    max_prompt = max(prompt_lens)
+
+    # Build GRPO-style: prompt_ids (B, max_prompt) + completion_ids (B, max_compl)
+    prompt_ids = torch.zeros(B, max_prompt, dtype=torch.long, device="cuda")
+    completion_ids = torch.randint(100, 5000, (B, max_compl), device="cuda")
+    p_mask_raw = torch.zeros(B, max_prompt, dtype=torch.long, device="cuda")
+    for i in range(B):
+        prompt_ids[i, : prompt_lens[i]] = torch.randint(
+            100, 5000, (prompt_lens[i],), device="cuda"
+        )
+        p_mask_raw[i, : prompt_lens[i]] = 1
+
+    input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+    c_mask_raw = torch.ones(B, max_compl, dtype=torch.long, device="cuda")
+    attn_mask = torch.cat([p_mask_raw, c_mask_raw], dim=1)
+    p_mask = torch.cat(
+        [p_mask_raw, torch.zeros(B, max_compl, dtype=torch.long, device="cuda")], dim=1
+    )
+
+    total_real = attn_mask.sum().item()
+    total_padded = input_ids.numel()
+    print(f"  Batch: {B} seqs, max_len={input_ids.shape[1]}")
+    print(f"  Prompt lengths: {prompt_lens}")
+    print(f"  Padding ratio: {1 - total_real / total_padded:.1%}")
+
+    with torch.no_grad():
+        logps_pad, _ = trainer._get_per_token_logps_and_entropies(
+            model, input_ids, attn_mask, max_compl
+        )
+        logps_flat = trainer._get_per_token_logps_flattened(
+            model, input_ids, attn_mask, max_compl, prompt_mask=p_mask
+        )
+
+    # skip_first=True because variable prompt padding causes position-0 divergence
+    max_diff, mean_diff, passed = _compare_logps(logps_pad, logps_flat)
+
+    print(f"  Max diff:  {max_diff:.8f}")
+    print(f"  Mean diff: {mean_diff:.8f}")
+
+    # Per-sequence check
+    diff = (logps_pad.float() - logps_flat.float()).abs()
+    for i in range(B):
+        seq_diff = diff[i, : compl_lens[i]].max().item() if compl_lens[i] > 0 else 0.0
+        status = "ok" if seq_diff < 1.0 else "BAD"
+        print(
+            f"    seq {i} (compl={compl_lens[i]:3d}): max_diff={seq_diff:.8f} {status}"
+        )
+
+    print(f"  Result: {'PASS' if passed else 'FAIL'}")
+    print()
+    return passed
+
+
+def test_prompt_mask_edge_case():
+    """Test 5: logits_to_keep > actual completion length (the 4B explosion bug).
+
+    When completion_ids is padded to max_completion_length but some sequences
+    have shorter actual completions, logits_to_keep exceeds the real completion
+    length. Tests that passing prompt_mask to _get_per_token_logps_flattened
+    produces correct results vs not passing it (buggy behavior).
+    """
+    print("=" * 60)
+    print("Test 5: prompt_mask edge case (logits_to_keep > completion)")
+    print("=" * 60)
+
+    model = setup_model()
+    trainer = make_mock_trainer(model)
+
+    torch.manual_seed(99)
+    B = 4
+    logits_to_keep = 128
+    prompt_lens = [30, 20, 40, 25]
+    compl_lens = [50, 128, 30, 100]
+    total_lens = [p + c for p, c in zip(prompt_lens, compl_lens, strict=True)]
+    max_len = max(p + logits_to_keep for p in prompt_lens)
+
+    input_ids = torch.zeros(B, max_len, dtype=torch.long, device="cuda")
+    attention_mask = torch.zeros(B, max_len, dtype=torch.long, device="cuda")
+    prompt_mask_tensor = torch.zeros(B, max_len, dtype=torch.long, device="cuda")
+
+    for i in range(B):
+        tl = total_lens[i]
+        input_ids[i, :tl] = torch.randint(100, 5000, (tl,), device="cuda")
+        attention_mask[i, :tl] = 1
+        prompt_mask_tensor[i, : prompt_lens[i]] = 1
+
+    print(f"  logits_to_keep={logits_to_keep}, actual completions={compl_lens}")
+    total_real = attention_mask.sum().item()
+    print(f"  Padding ratio: {1 - total_real / (B * max_len):.1%}")
+
+    with torch.no_grad():
+        # Padded reference (always correct since it uses logits_to_keep slicing)
+        logps_pad, _ = trainer._get_per_token_logps_and_entropies(
+            model, input_ids, attention_mask, logits_to_keep
+        )
+
+        # Flattened WITH prompt_mask (correct)
+        logps_flat_correct = trainer._get_per_token_logps_flattened(
+            model,
+            input_ids,
+            attention_mask,
+            logits_to_keep,
+            prompt_mask=prompt_mask_tensor,
+        )
+
+        # Flattened WITHOUT prompt_mask (buggy -- infers prompt_len as seq_len - logits_to_keep)
+        logps_flat_buggy = trainer._get_per_token_logps_flattened(
+            model,
+            input_ids,
+            attention_mask,
+            logits_to_keep,
+            prompt_mask=None,
+        )
+
+    # Compare with-prompt-mask vs without-prompt-mask directly.
+    # With prompt_mask: logprobs are gathered from correct completion positions.
+    # Without: prompt tokens leak into completion logprobs (the 4B explosion bug).
+    # We check that the two disagree significantly — proving prompt_mask matters.
+    diff_between = (logps_flat_correct.float() - logps_flat_buggy.float()).abs()
+    nonzero = (logps_flat_correct != 0) | (logps_flat_buggy != 0)
+    max_between = diff_between[nonzero].max().item() if nonzero.any() else 0.0
+
+    # Also check correct path against padded (skip position 0 due to prompt padding)
+    diff_correct = (logps_pad.float() - logps_flat_correct.float()).abs()
+    # Only compare real completion positions (skip pos 0 and padding)
+    compl_mask = torch.zeros_like(diff_correct)
+    for i in range(B):
+        compl_mask[i, 1 : compl_lens[i]] = 1.0  # skip pos 0
+    masked_diff = diff_correct * compl_mask
+    max_correct = masked_diff.max().item()
+    max_buggy = max_between  # how much the buggy path disagrees with correct
+
+    print(f"  With prompt_mask:    max_diff={max_correct:.4f}")
+    print(f"  Without prompt_mask: max_diff={max_buggy:.4f}")
+    print("  (buggy path grabs prompt tokens as completion -> huge diff)")
+
+    # prompt_mask path should be significantly better than buggy path
+    passed = max_correct < max_buggy
+    print(f"  Result: {'PASS' if passed else 'FAIL'}")
+    print()
+    return passed
+
+
+def test_training_flattened_gradients():
+    """Test 6: Training forward+backward with flattened method produces correct gradients.
+
+    Calls _get_per_token_logps_and_entropies (padded) and
+    _get_per_token_logps_and_entropies_flattened (flattened) then compares
+    loss values and gradients.
+    """
+    print("=" * 60)
+    print("Test 6: Training fwd+bwd flattening (gradient check)")
+    print("=" * 60)
+
+    input_ids, attn_mask, compl_mask, logits_to_keep, prompt_mask, _meta = (
+        make_grpo_batch(B=4)
+    )
+    advantages = torch.randn(input_ids.shape[0], device="cuda")
+
+    # Get old_logps for the loss computation (shared between both paths)
+    ref_model = setup_model()
+    ref_trainer = make_mock_trainer(ref_model)
+    with torch.no_grad():
+        old_logps, _ = ref_trainer._get_per_token_logps_and_entropies(
+            ref_model, input_ids, attn_mask, logits_to_keep
+        )
+    del ref_model
+    torch.cuda.empty_cache()
+
+    adv = advantages.unsqueeze(1)
+
+    # Padded loss + backward
+    model_pad = setup_model(eval_mode=False)
+    trainer_pad = make_mock_trainer(model_pad)
+    model_pad.zero_grad()
+    with torch.autocast("cuda", dtype=torch.bfloat16):
+        logps_pad, _ = trainer_pad._get_per_token_logps_and_entropies(
+            model_pad, input_ids, attn_mask, logits_to_keep
+        )
+    ratio_pad = torch.exp(logps_pad - old_logps.detach())
+    loss_pad = -(ratio_pad * adv * compl_mask).sum() / compl_mask.sum().clamp(min=1)
+    loss_pad.backward()
+
+    # Flattened loss + backward
+    model_flat = setup_model(eval_mode=False)
+    trainer_flat = make_mock_trainer(model_flat)
+    model_flat.zero_grad()
+    with torch.autocast("cuda", dtype=torch.bfloat16):
+        logps_flat, _ = trainer_flat._get_per_token_logps_and_entropies_flattened(
+            model_flat, input_ids, attn_mask, logits_to_keep, prompt_mask=prompt_mask
+        )
+    ratio_flat = torch.exp(logps_flat - old_logps.detach())
+    loss_flat = -(ratio_flat * adv * compl_mask).sum() / compl_mask.sum().clamp(min=1)
+    loss_flat.backward()
+
+    # Compare
+    rel_loss = abs(loss_pad.item() - loss_flat.item()) / max(abs(loss_pad.item()), 1e-8)
+
+    max_grad_diff = 0.0
+    max_grad_mag = 0.0
+    n_params = 0
+    for (_n1, p1), (_n2, p2) in zip(
+        model_pad.named_parameters(), model_flat.named_parameters(), strict=True
+    ):
+        if p1.grad is not None and p2.grad is not None:
+            diff = (p1.grad.float() - p2.grad.float()).abs().max().item()
+            max_grad_diff = max(max_grad_diff, diff)
+            max_grad_mag = max(max_grad_mag, p1.grad.float().abs().max().item())
+            n_params += 1
+
+    rel_grad = max_grad_diff / max(max_grad_mag, 1e-8)
+
+    print(f"  Padded loss:  {loss_pad.item():.8f}")
+    print(f"  Flat loss:    {loss_flat.item():.8f}")
+    print(f"  Rel loss diff: {rel_loss:.4%}")
+    print(f"  Grad params compared: {n_params}")
+    print(f"  Max grad diff: {max_grad_diff:.8f}, mag: {max_grad_mag:.8f}")
+    print(f"  Rel grad diff: {rel_grad:.4%}")
+
+    passed = rel_loss < 0.05 and rel_grad < 0.15
+    print(f"  Result: {'PASS' if passed else 'FAIL'}")
+    print()
+
+    del model_pad, model_flat
+    torch.cuda.empty_cache()
+    return passed
+
+
+if __name__ == "__main__":
+    print("\nBatch Flattening Correctness Tests")
+    print(f"Model: {MODEL_NAME}")
+    print(f"{'=' * 60}\n")
+
+    results = []
+    results.append(("Scoring correctness", test_scoring_correctness()))
+    results.append(("Training loss", test_training_loss_correctness()))
+    results.append(("Gradient correctness", test_gradient_correctness()))
+    results.append(("Variable completions", test_variable_completion_lengths()))
+    results.append(("prompt_mask edge case", test_prompt_mask_edge_case()))
+    results.append(("Training fwd+bwd flat", test_training_flattened_gradients()))
+
+    print("=" * 60)
+    print("SUMMARY")
+    print("=" * 60)
+    all_passed = True
+    for name, passed in results:
+        status = "PASS" if passed else "FAIL"
+        print(f"  {name:30s} {status}")
+        all_passed = all_passed and passed
+
+    print(f"\n  Overall: {'ALL TESTS PASSED' if all_passed else 'SOME TESTS FAILED'}")
+    print()
diff --git a/tests/e2e/solo/test_trainer_loss_calc.py b/tests/e2e/solo/test_trainer_loss_calc.py
index c72cb621b..cdb51990c 100644
--- a/tests/e2e/solo/test_trainer_loss_calc.py
+++ b/tests/e2e/solo/test_trainer_loss_calc.py
@@ -2,6 +2,8 @@
 
 import unittest
 
+import pytest
+
 from axolotl.monkeypatch.transformers.trainer_loss_calc import (
     check_evaluation_loop_is_patchable,
     check_maybe_log_save_evaluate_is_patchable,
@@ -13,6 +15,7 @@ class TestTrainerLossCalc(unittest.TestCase):
     Unit test class for trainer loss calc monkeypatch
     """
 
+    @pytest.mark.xfail(reason="flaky", strict=False)
     def test_trainer_loss_calc_is_patchable(self):
         """
         Test that the upstream transformers code is still patchable. This will fail if