Fix re-asking: deterministic slot memory + user-turn merge + reason-loop prompt

Historical calls showed the 8B re-asking for name/reason/phone it already had
("I already gave you my full name", the "I want an appointment" -> "what brings
you in?" loop) and VAD splitting one utterance into consecutive user turns.

- callstate.py: CallStateGroomer between agg.user() and the LLM. After each
  agent turn (off the critical path) it extracts collected slots via one short
  JSON-mode Ollama pass, then before each generation injects an ALREADY
  COLLECTED / STILL NEEDED checklist into the system message and merges
  VAD-fragmented consecutive user messages. Callback-type calls get an explicit
  "no booking questions" line. CALL_STATE_TRACKING env (auto: on for ollama,
  off for anthropic).
- bot.py prompt step 1: "I want an appointment" is the booking intent, not the
  reason - ask the visit reason once, never twice.
- scripts/ab_replay.py: regression harness replaying the real failed calls.
  llama3.1-8b raw = 3 failures; with CALL STATE = 0 failures across all
  scenarios (chat latency 0.31s -> 0.55s median, well under the 3s gate).
  Qwen3-14B A/B'd and rejected: no better raw, ~3s/turn, 11GB VRAM.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
tocmo0nlord
2026-07-03 23:49:39 +00:00
parent bae388420b
commit a47f4b423c
5 changed files with 445 additions and 2 deletions

165
scripts/ab_replay.py Normal file
View File

@@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""A/B replay: re-run the historical problem scenarios against candidate LLMs.
Replays scripted caller turns (taken from real failed calls in the run logs) through the
production system prompt and checks each model for the observed failure modes: re-asking
the reason ("I want an appointment" loop), re-asking name/phone, and forcing booking
questions (insurance/day-time) on non-booking callers. Also reports per-turn latency.
Usage (inside the pipecat venv):
python scripts/ab_replay.py activeblue-avc:latest qwen3:14b
python scripts/ab_replay.py --state activeblue-avc:latest # with CALL STATE injection
--state simulates the CallStateGroomer: between turns it runs the callstate extraction
and injects the ALREADY COLLECTED / STILL NEEDED checklist, exactly as in-call.
"""
import argparse
import asyncio
import re
import sys
import time
from pathlib import Path
import httpx
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from bot import SYSTEM_PROMPT # noqa: E402 (import parses env + practice facts only)
from callstate import build_state_block, extract_call_state # noqa: E402
# Base URL of the Ollama server; chat() posts to its /api/chat endpoint and the same
# URL is handed to extract_call_state() when --state is on.
OLLAMA = "http://127.0.0.1:11434"
# Caller-ID instructions appended to SYSTEM_PROMPT for every replayed scenario: they
# supply a number "on file" and tell the model to read it back near the end of the call
# as a statement rather than asking for it.
CALLER_LINE = (
"\n\nCALLER ID: the caller's number on file, written so you read it digit by digit, "
"is: nine seven three, five seven three, one six seven one. Near the end, state it back "
"and invite a correction only ('...; if that's not the best number, just let me know.') — "
"do NOT ask a yes/no question or wait for a 'yes'. Only change it if they give a different "
"number. Do not say it any earlier in the call."
)
# Opening assistant line seeded into each replayed conversation before the first caller turn.
GREETING = "Thank you for calling Advanced Vision Care, this is AVA. How can I help you today?"
# Failure-mode detectors. A scenario check is a (label, regex, max allowed count)
# triple, and the regex is counted across the assistant's turns. All detectors are
# case-insensitive.
ASK_REASON = re.compile(
    r"what brings you"
    r"|reason for"
    r"|reason you"
    r"|what would you like to be seen"
    r"|what.s the visit for"
    r"|what seems to be",
    re.IGNORECASE,
)
ASK_NAME = re.compile(r"(full |your |the )name", re.IGNORECASE)
ASK_INSURANCE = re.compile(r"insurance", re.IGNORECASE)
# Only a request FOR a number counts as a failure. The statement-form readback
# ("I have your number as ...; if that's not the best number, just let me know")
# is the desired behavior and must not match.
ASK_PHONE_Q = re.compile(
    r"(what('| i)s|can I (get|have)|may I (get|have)|could I (get|have)|give me)"
    r".{0,40}"
    r"(phone|number)",
    re.IGNORECASE,
)
ASK_LOCATION = re.compile(
    r"(which|what)"
    r".{0,30}"
    r"(city|area|office|location)",
    re.IGNORECASE,
)
# Replay scripts distilled from real failed calls; each scenario name cites its log.
# A tuple inside `turns` is one utterance that VAD split into fragments: its items go
# out as separate user messages answered by a single reply (seen in log.10 call#1).
SCENARIOS = [
    {
        "name": "reason-loop (avc_run.log call#1 / log.21 call#5)",
        "turns": [
            "I want an appointment.",
            "appointment",
            "Kendall",
            "Carlos Garcia",
            "Humana",
            ("Monday", "3 p.m."),
            "No, that's all, thank you.",
        ],
        "checks": [
            ("re-asked reason", ASK_REASON, 1),
            ("re-asked name", ASK_NAME, 1),
            ("re-asked location", ASK_LOCATION, 1),
            ("asked for phone (has caller-ID)", ASK_PHONE_Q, 0),
        ],
    },
    {
        "name": "glasses callback (log.23/24/25)",
        "turns": [
            "Hey, I'm a patient in Kendall and I need to know when my glasses are ready.",
            "Carlos Garcia",
            "That's what I'm asking — the status of my order.",
            "Yes, that's a good number.",
            "No, that's all.",
        ],
        "checks": [
            ("asked insurance on non-booking call", ASK_INSURANCE, 0),
            (
                "asked day/time on non-booking call",
                re.compile(r"(what|which) day|day and time|preferred (day|time)", re.I),
                0,
            ),
            ("re-asked name", ASK_NAME, 1),
        ],
    },
    {
        "name": "early-info booking (log.4 call#1: reason+city up front)",
        "turns": [
            "I'm having eye pain and I'm in Kendall, Florida.",
            "Yes please.",
            "Carlos Garcia",
            "Florida Blue Medicare",
            ("Monday", "5 p.m."),
            "No, that's everything.",
        ],
        "checks": [
            ("re-asked reason", ASK_REASON, 1),
            ("re-asked location", ASK_LOCATION, 0),  # was given in turn 1
            ("re-asked name", ASK_NAME, 1),
            ("asked for phone (has caller-ID)", ASK_PHONE_Q, 0),
        ],
    },
]
async def chat(client, model, messages, think_capable):
    """Send one non-streaming chat request to Ollama and time it.

    Args:
        client: async HTTP client exposing ``post(url, json=...)``; ``main`` passes an
            ``httpx.AsyncClient``.
        model: Ollama model tag to generate with.
        messages: chat history as a list of ``{"role": ..., "content": ...}`` dicts.
        think_capable: when true, ``"think": False`` is added to the request body.

    Returns:
        A ``(reply, seconds)`` tuple: the stripped assistant text and the elapsed
        time of the request in seconds.

    Raises:
        Whatever ``raise_for_status()`` raises for a non-success HTTP response, and
        ``KeyError`` if the JSON payload has no ``message.content``.
    """
    body = {
        "model": model,
        "stream": False,
        "messages": messages,
        "options": {"temperature": 0.3, "num_predict": 160, "num_ctx": 8192},
    }
    if think_capable:
        body["think"] = False
    # perf_counter() is monotonic, so the reported latency cannot be skewed (or go
    # negative) when the system clock is adjusted mid-request, unlike time.time().
    t0 = time.perf_counter()
    r = await client.post(f"{OLLAMA}/api/chat", json=body)
    r.raise_for_status()
    return r.json()["message"]["content"].strip(), time.perf_counter() - t0
async def run_scenario(client, model, sc, with_state):
    """Replay one scenario's caller turns against `model`.

    The conversation starts from SYSTEM_PROMPT + CALLER_LINE and the scripted
    GREETING. Each entry of ``sc["turns"]`` produces exactly one model reply; a
    tuple entry is sent as several consecutive user messages first.

    Args:
        client: HTTP client forwarded to ``chat``.
        model: Ollama model tag under test.
        sc: scenario dict; only its ``"turns"`` list is read here.
        with_state: when true, call state is re-extracted before every reply and the
            resulting checklist block is appended to the system message.

    Returns:
        ``(transcript, latencies)``: transcript is a list of ``(role, text)`` pairs
        with role "A" (assistant) or "C" (caller); latencies holds one duration per
        generated reply.
    """
    system_text = SYSTEM_PROMPT + CALLER_LINE
    history = [
        {"role": "system", "content": system_text},
        {"role": "assistant", "content": GREETING},
    ]
    transcript = [("A", GREETING)]
    latencies = []
    # Only these model families are sent the explicit "think": False flag.
    can_think = any(tag in model for tag in ("qwen3", "deepseek-r1"))

    for turn in sc["turns"]:
        fragments = turn if isinstance(turn, tuple) else (turn,)
        history.extend({"role": "user", "content": piece} for piece in fragments)
        transcript.extend(("C", piece) for piece in fragments)

        if with_state:
            # Best effort: a failed extraction is reported and the system message
            # keeps whatever content it had from the previous turn.
            try:
                state = await extract_call_state(history, OLLAMA, model)
                checklist = build_state_block(state)
                if checklist:
                    history[0]["content"] = system_text + "\n\n" + checklist
                else:
                    history[0]["content"] = system_text
            except Exception as e:
                print(f" (state extraction failed: {e})")

        answer, seconds = await chat(client, model, history, can_think)
        latencies.append(seconds)
        history.append({"role": "assistant", "content": answer})
        transcript.append(("A", answer))
        # The assistant saying goodbye ends the call; remaining scripted turns are skipped.
        if "goodbye" in answer.lower():
            break

    return transcript, latencies
def score(sc, transcript):
    """Evaluate a transcript against a scenario's failure-mode checks.

    Args:
        sc: scenario dict whose ``"checks"`` entry is a list of
            ``(label, compiled_regex, max_allowed)`` triples.
        transcript: list of ``(role, text)`` pairs; only role ``"A"`` (assistant)
            entries are examined.

    Returns:
        One message per violated check, in check order. A check is violated when the
        number of assistant replies matching its regex exceeds ``max_allowed``; each
        reply counts at most once regardless of how many matches it contains.
    """
    assistant_lines = [text for role, text in transcript if role == "A"]
    failures = []
    for label, pattern, limit in sc["checks"]:
        hits = len([line for line in assistant_lines if pattern.search(line)])
        if hits <= limit:
            continue
        failures.append(f"{label} ({hits}x, max {limit})")
    return failures
async def main():
    """Command-line entry point: replay every scenario against each requested model.

    Prints a PASS/FAIL line per scenario (with the transcript when it failed or when
    -v is given), a per-model failure count with median and max reply latency, and a
    closing summary table over all models.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("models", nargs="+")
    parser.add_argument("--state", action="store_true", help="inject CALL STATE checklist per turn")
    parser.add_argument("-v", "--verbose", action="store_true", help="print transcripts")
    opts = parser.parse_args()

    rule = "=" * 70
    state_tag = " + CALL STATE" if opts.state else ""

    async with httpx.AsyncClient(timeout=120) as client:
        summary = {}
        for model in opts.models:
            print(f"\n{rule}\nMODEL: {model}{state_tag}\n{rule}")
            failure_count = 0
            latencies = []
            for scenario in SCENARIOS:
                transcript, turn_lats = await run_scenario(client, model, scenario, opts.state)
                problems = score(scenario, transcript)
                failure_count += len(problems)
                latencies.extend(turn_lats)
                verdict = "FAIL: " + "; ".join(problems) if problems else "PASS"
                print(f"\n--- {scenario['name']} -> {verdict}")
                if opts.verbose or problems:
                    for role, text in transcript:
                        print(f" {role}: {text}")
            # Median is taken as the element at index len // 2 of the sorted latencies.
            latencies.sort()
            median = latencies[len(latencies) // 2]
            slowest = latencies[-1]
            summary[model] = (failure_count, median, slowest)
            print(f"\n{model}: {failure_count} failure(s) | latency med={median:.2f}s max={slowest:.2f}s")

        print(f"\n{rule}\nSUMMARY{' (+state)' if opts.state else ''}")
        for name, (count, median, slowest) in summary.items():
            print(f" {name:35s} failures={count} lat med={median:.2f}s max={slowest:.2f}s")


if __name__ == "__main__":
    asyncio.run(main())