Fix re-asking: deterministic slot memory + user-turn merge + reason-loop prompt

Historical calls showed the 8B model re-asking for name/reason/phone it already had ("I already gave you my full name", the "I want an appointment" -> "what brings you in?" loop) and VAD splitting one utterance into consecutive user turns.

- callstate.py: CallStateGroomer sits between agg.user() and the LLM. After each agent turn (off the critical path) it extracts collected slots via one short JSON-mode Ollama pass, then before each generation injects an ALREADY COLLECTED / STILL NEEDED checklist into the system message and merges VAD-fragmented consecutive user messages. Callback-type calls get an explicit "no booking questions" line. Controlled by the CALL_STATE_TRACKING env var (auto: on for ollama, off for anthropic).
- bot.py prompt step 1: "I want an appointment" is the booking intent, not the reason - ask the visit reason once, never twice.
- scripts/ab_replay.py: regression harness replaying the real failed calls. llama3.1-8b raw = 3 failures; with CALL STATE = 0 failures across all scenarios (chat latency 0.31s -> 0.55s median, well under the 3s gate). Qwen3-14B was A/B'd and rejected: no better raw, ~3s/turn, 11GB VRAM.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-03 23:49:39 +00:00
parent bae388420b
commit a47f4b423c
5 changed files with 445 additions and 2 deletions
--- a/scripts/ab_replay.py
+++ b/scripts/ab_replay.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+"""A/B replay: re-run the historical problem scenarios against candidate LLMs.
+
+Replays scripted caller turns (taken from real failed calls in the run logs) through the
+production system prompt and checks each model for the observed failure modes: re-asking
+the reason ("I want an appointment" loop), re-asking name/phone, and forcing booking
+questions (insurance/day-time) on non-booking callers. Also reports per-turn latency.
+
+Usage (inside the pipecat venv):
+    python scripts/ab_replay.py activeblue-avc:latest qwen3:14b
+    python scripts/ab_replay.py --state activeblue-avc:latest   # with CALL STATE injection
+
+--state simulates the CallStateGroomer: between turns it runs the callstate extraction
+and injects the ALREADY COLLECTED / STILL NEEDED checklist, exactly as in-call.
+"""
+import argparse
+import asyncio
+import re
+import sys
+import time
+from pathlib import Path
+
+import httpx
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from bot import SYSTEM_PROMPT  # noqa: E402  (import parses env + practice facts only)
+from callstate import build_state_block, extract_call_state  # noqa: E402
+
+OLLAMA = "http://127.0.0.1:11434"  # base URL; chat() appends /api/chat
+CALLER_LINE = (  # appended to SYSTEM_PROMPT by run_scenario to form the system message
+    "\n\nCALLER ID: the caller's number on file, written so you read it digit by digit, "
+    "is: nine seven three, five seven three, one six seven one. Near the end, state it back "
+    "and invite a correction only ('...; if that's not the best number, just let me know.') — "
+    "do NOT ask a yes/no question or wait for a 'yes'. Only change it if they give a different "
+    "number. Do not say it any earlier in the call."
+)
+GREETING = "Thank you for calling Advanced Vision Care, this is AVA. How can I help you today?"  # first assistant message of every replay
+
+# Failure-mode detectors, used in scenario checks as (label, regex, max number of assistant turns allowed to match).
+ASK_REASON = re.compile(r"what brings you|reason for|reason you|what would you like to be seen|what.s the visit for|what seems to be", re.I)
+ASK_NAME = re.compile(r"(full |your |the )name", re.I)  # NOTE(review): also matches statement-form mentions ("I have your name as ...") — confirm intended
+ASK_INSURANCE = re.compile(r"insurance", re.I)
+# Asking FOR a number is the failure; the statement-form readback ("I have your number as ...; if that's not
+# the best number, just let me know") is correct behavior. NOTE(review): a curly-apostrophe "what’s" is not matched.
+ASK_PHONE_Q = re.compile(r"(what('| i)s|can I (get|have)|may I (get|have)|could I (get|have)|give me).{0,40}(phone|number)", re.I)
+ASK_LOCATION = re.compile(r"(which|what).{0,30}(city|area|office|location)", re.I)
+
+# Scenarios distilled from real failed calls (log refs in each `name`). `checks` entries are (label, regex, max_ok).
+# A tuple within `turns` = VAD-fragmented utterance (two user messages, one reply) — from log.10 call#1.
+SCENARIOS = [
+    dict(
+        name="reason-loop (avc_run.log call#1 / log.21 call#5)",
+        turns=["I want an appointment.", "appointment", "Kendall",
+               "Carlos Garcia", "Humana", ("Monday", "3 p.m."), "No, that's all, thank you."],
+        checks=[("re-asked reason", ASK_REASON, 1),
+                ("re-asked name", ASK_NAME, 1),
+                ("re-asked location", ASK_LOCATION, 1),
+                ("asked for phone (has caller-ID)", ASK_PHONE_Q, 0)],
+    ),
+    dict(
+        name="glasses callback (log.23/24/25)",
+        turns=["Hey, I'm a patient in Kendall and I need to know when my glasses are ready.",
+               "Carlos Garcia", "That's what I'm asking — the status of my order.",
+               "Yes, that's a good number.", "No, that's all."],
+        checks=[("asked insurance on non-booking call", ASK_INSURANCE, 0),
+                ("asked day/time on non-booking call",
+                 re.compile(r"(what|which) day|day and time|preferred (day|time)", re.I), 0),
+                ("re-asked name", ASK_NAME, 1)],
+    ),
+    dict(
+        name="early-info booking (log.4 call#1: reason+city up front)",
+        turns=["I'm having eye pain and I'm in Kendall, Florida.", "Yes please.",
+               "Carlos Garcia", "Florida Blue Medicare", ("Monday", "5 p.m."),
+               "No, that's everything."],
+        checks=[("re-asked reason", ASK_REASON, 1),
+                ("re-asked location", ASK_LOCATION, 0),   # was given in turn 1
+                ("re-asked name", ASK_NAME, 1),
+                ("asked for phone (has caller-ID)", ASK_PHONE_Q, 0)],
+    ),
+]
+
+
+async def chat(client, model, messages, think_capable):
+    """POST one non-streaming chat request to Ollama; return (stripped reply text, elapsed seconds)."""
+    options = {"temperature": 0.3, "num_predict": 160, "num_ctx": 8192}
+    payload = {"model": model, "stream": False, "messages": messages, "options": options}
+    if think_capable:
+        payload["think"] = False  # sent only for models the caller flags as thinking-capable
+    # perf_counter is monotonic, so a wall-clock adjustment mid-request cannot skew the latency.
+    started = time.perf_counter()
+    response = await client.post(f"{OLLAMA}/api/chat", json=payload)
+    response.raise_for_status()  # surface HTTP error statuses to the caller as exceptions
+    return response.json()["message"]["content"].strip(), time.perf_counter() - started
+
+
+async def run_scenario(client, model, sc, with_state):
+    """Replay sc["turns"] against `model`; return (transcript of ("A"|"C", text) pairs, reply latencies)."""
+    system_text = SYSTEM_PROMPT + CALLER_LINE
+    history = [{"role": "system", "content": system_text},
+               {"role": "assistant", "content": GREETING}]
+    transcript, latencies = [("A", GREETING)], []
+    can_think = "qwen3" in model or "deepseek-r1" in model
+    for turn in sc["turns"]:
+        # A tuple turn is sent as several consecutive user messages that share one reply.
+        for fragment in (turn if isinstance(turn, tuple) else (turn,)):
+            history.append({"role": "user", "content": fragment})
+            transcript.append(("C", fragment))
+        if with_state:
+            try:  # best effort: on failure the turn still runs with the previous system message
+                block = build_state_block(await extract_call_state(history, OLLAMA, model))
+                history[0]["content"] = system_text + ("\n\n" + block if block else "")
+            except Exception as e:
+                print(f"    (state extraction failed: {e})")
+        reply, seconds = await chat(client, model, history, can_think)
+        latencies.append(seconds)
+        history.append({"role": "assistant", "content": reply})
+        transcript.append(("A", reply))
+        if "goodbye" in reply.lower():  # the agent signing off ends the replay early
+            break
+    return transcript, latencies
+
+
+def score(sc, transcript):
+    """Return failure strings for every check in sc["checks"] that the transcript violates."""
+    # Only assistant ("A") entries are scored; user ("C") entries are ignored.
+    agent_lines = [text for speaker, text in transcript if speaker == "A"]
+    # A check (label, regex, max_ok) counts turns with at least one match, not total matches.
+    tallies = [(label, sum(bool(rx.search(line)) for line in agent_lines), max_ok)
+               for label, rx, max_ok in sc["checks"]]
+    return [f"{label} ({hits}x, max {max_ok})" for label, hits, max_ok in tallies if hits > max_ok]
+
+
+async def main():
+    """Parse CLI args, replay every scenario against each model, print per-model and summary results."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("models", nargs="+")
+    parser.add_argument("--state", action="store_true", help="inject CALL STATE checklist per turn")
+    parser.add_argument("-v", "--verbose", action="store_true", help="print transcripts")
+    args = parser.parse_args()
+    summary = {}  # model name -> (failure count, median latency, max latency)
+    async with httpx.AsyncClient(timeout=120) as client:
+        for model in args.models:
+            print(f"\n{'='*70}\nMODEL: {model}{' + CALL STATE' if args.state else ''}\n{'='*70}")
+            failure_count, latencies = 0, []
+            for sc in SCENARIOS:
+                transcript, turn_latencies = await run_scenario(client, model, sc, args.state)
+                failures = score(sc, transcript)
+                failure_count += len(failures)
+                latencies.extend(turn_latencies)
+                verdict = "FAIL: " + "; ".join(failures) if failures else "PASS"
+                print(f"\n--- {sc['name']} -> {verdict}")
+                if args.verbose or failures:  # failing scenarios always show their transcript
+                    for speaker, text in transcript:
+                        print(f"    {speaker}: {text}")
+            latencies.sort()
+            median, slowest = latencies[len(latencies) // 2], latencies[-1]  # upper median, maximum
+            summary[model] = (failure_count, median, slowest)
+            print(f"\n{model}: {failure_count} failure(s) | latency med={median:.2f}s max={slowest:.2f}s")
+        print(f"\n{'='*70}\nSUMMARY{' (+state)' if args.state else ''}")
+        for name, (failure_count, median, slowest) in summary.items():
+            print(f"  {name:35s} failures={failure_count}  lat med={median:.2f}s max={slowest:.2f}s")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())