#!/usr/bin/env python3
"""A/B replay: re-run the historical problem scenarios against candidate LLMs.

Replays scripted caller turns (taken from real failed calls in the run logs) through the
production system prompt and checks each model for the observed failure modes: re-asking
the reason ("I want an appointment" loop), re-asking name/phone, and forcing booking
questions (insurance/day-time) on non-booking callers. Also reports per-turn latency.

Usage (inside the pipecat venv):
    python scripts/ab_replay.py activeblue-avc:latest qwen3:14b
    python scripts/ab_replay.py --state activeblue-avc:latest   # with CALL STATE injection

--state simulates the CallStateGroomer: between turns it runs the callstate extraction and
injects the ALREADY COLLECTED / STILL NEEDED checklist, exactly as in-call.
"""
import argparse
import asyncio
import re
import sys
import time
from pathlib import Path

import httpx

# Put the directory one level above this script's directory on sys.path so the
# `bot` and `callstate` modules imported below can be resolved.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from bot import SYSTEM_PROMPT  # noqa: E402  (import parses env + practice facts only)
from callstate import build_state_block, extract_call_state  # noqa: E402

# Base URL of the Ollama server; the chat endpoint is requested under this address.
OLLAMA = "http://127.0.0.1:11434"

# Appended to SYSTEM_PROMPT for every replayed scenario: supplies a caller-ID number and
# the rules for reading it back.
CALLER_LINE = (
    "\n\nCALLER ID: the caller's number on file, written so you read it digit by digit, "
    "is: nine seven three, five seven three, one six seven one. Near the end, state it back "
    "and invite a correction only ('...; if that's not the best number, just let me know.') — "
    "do NOT ask a yes/no question or wait for a 'yes'. Only change it if they give a different "
    "number. Do not say it any earlier in the call."
)

# Seeded as the first assistant message of every replayed conversation.
GREETING = "Thank you for calling Advanced Vision Care, this is AVA. How can I help you today?"
# Failure-mode detectors: (label, regex counted across assistant turns, max allowed count)
ASK_REASON = re.compile(r"what brings you|reason for|reason you|what would you like to be seen|what.s the visit for|what seems to be", re.I)
ASK_NAME = re.compile(r"(full |your |the )name", re.I)
ASK_INSURANCE = re.compile(r"insurance", re.I)
# Asking FOR a number is the failure; the statement-form readback ("I have your number
# as ...; if that's not the best number, just let me know") is correct behavior.
ASK_PHONE_Q = re.compile(r"(what('| i)s|can I (get|have)|may I (get|have)|could I (get|have)|give me).{0,40}(phone|number)", re.I)
ASK_LOCATION = re.compile(r"(which|what).{0,30}(city|area|office|location)", re.I)

# Scenarios distilled from real failed calls (log refs in comments). A tuple within
# `turns` = VAD-fragmented utterance (two user messages, one reply) — from log.10 call#1.
SCENARIOS = [
    dict(
        name="reason-loop (avc_run.log call#1 / log.21 call#5)",
        turns=[
            "I want an appointment.",
            "appointment",
            "Kendall",
            "Carlos Garcia",
            "Humana",
            ("Monday", "3 p.m."),
            "No, that's all, thank you.",
        ],
        checks=[
            ("re-asked reason", ASK_REASON, 1),
            ("re-asked name", ASK_NAME, 1),
            ("re-asked location", ASK_LOCATION, 1),
            ("asked for phone (has caller-ID)", ASK_PHONE_Q, 0),
        ],
    ),
    dict(
        name="glasses callback (log.23/24/25)",
        turns=[
            "Hey, I'm a patient in Kendall and I need to know when my glasses are ready.",
            "Carlos Garcia",
            "That's what I'm asking — the status of my order.",
            "Yes, that's a good number.",
            "No, that's all.",
        ],
        checks=[
            ("asked insurance on non-booking call", ASK_INSURANCE, 0),
            ("asked day/time on non-booking call",
             re.compile(r"(what|which) day|day and time|preferred (day|time)", re.I), 0),
            ("re-asked name", ASK_NAME, 1),
        ],
    ),
    dict(
        name="early-info booking (log.4 call#1: reason+city up front)",
        turns=[
            "I'm having eye pain and I'm in Kendall, Florida.",
            "Yes please.",
            "Carlos Garcia",
            "Florida Blue Medicare",
            ("Monday", "5 p.m."),
            "No, that's everything.",
        ],
        checks=[
            ("re-asked reason", ASK_REASON, 1),
            ("re-asked location", ASK_LOCATION, 0),  # was given in turn 1
            ("re-asked name", ASK_NAME, 1),
            ("asked for phone (has caller-ID)", ASK_PHONE_Q, 0),
        ],
    ),
]


async def chat(client, model, messages, think_capable):
    """Send one non-streaming chat request and time it.

    Args:
        client: HTTP client exposing an awaitable ``post`` (``main`` passes an
            ``httpx.AsyncClient``).
        model: Model name placed in the request body.
        messages: Full message list (system / assistant / user dicts) sent as-is.
        think_capable: When true, ``"think": False`` is added to the request body.

    Returns:
        ``(reply_text, elapsed_seconds)`` where ``reply_text`` is the stripped
        ``message.content`` of the JSON response.

    Raises:
        Whatever ``raise_for_status()`` raises on a non-success HTTP response.
    """
    body = {
        "model": model,
        "stream": False,
        "messages": messages,
        "options": {"temperature": 0.3, "num_predict": 160, "num_ctx": 8192},
    }
    if think_capable:
        body["think"] = False
    # perf_counter is monotonic, so a wall-clock adjustment during the request
    # cannot produce a negative or inflated latency.
    t0 = time.perf_counter()
    r = await client.post(f"{OLLAMA}/api/chat", json=body)
    r.raise_for_status()
    return r.json()["message"]["content"].strip(), time.perf_counter() - t0


async def run_scenario(client, model, sc, with_state):
    """Replay one scenario's caller turns against ``model``.

    Args:
        client: HTTP client forwarded to ``chat``.
        model: Model name; also used for the state-extraction call.
        sc: Scenario dict; only ``sc["turns"]`` is read here.
        with_state: When true, the call-state block is rebuilt and appended to the
            system prompt before each reply.

    Returns:
        ``(transcript, lats)``: ``transcript`` is a list of ``(role, text)`` pairs with
        role ``"A"`` (assistant) or ``"C"`` (caller); ``lats`` holds one latency in
        seconds per assistant reply.
    """
    # Models whose name contains one of these markers get "think": False in chat().
    think_capable = "qwen3" in model or "deepseek-r1" in model
    base_system = SYSTEM_PROMPT + CALLER_LINE
    msgs = [
        {"role": "system", "content": base_system},
        {"role": "assistant", "content": GREETING},
    ]
    lats, transcript = [], [("A", GREETING)]
    for turn in sc["turns"]:
        # A tuple turn is several user messages answered by a single reply.
        frags = turn if isinstance(turn, tuple) else (turn,)
        for f in frags:
            msgs.append({"role": "user", "content": f})
            transcript.append(("C", f))
        if with_state:
            # Best-effort: a failed extraction is reported and the turn still runs
            # with whatever system prompt is currently in place.
            try:
                state = await extract_call_state(msgs, OLLAMA, model)
                block = build_state_block(state)
                # Always rebuild from base_system so blocks never accumulate.
                msgs[0]["content"] = base_system + ("\n\n" + block if block else "")
            except Exception as e:
                print(f" (state extraction failed: {e})")
        reply, dt = await chat(client, model, msgs, think_capable)
        lats.append(dt)
        msgs.append({"role": "assistant", "content": reply})
        transcript.append(("A", reply))
        # Stop replaying once the assistant has said goodbye.
        if "goodbye" in reply.lower():
            break
    return transcript, lats


def score(sc, transcript):
    """Return the list of failed checks for a transcript.

    For each ``(label, regex, max_ok)`` in ``sc["checks"]`` this counts the assistant
    replies (role ``"A"``) in which the regex matches at least once; a check fails when
    that count exceeds ``max_ok``.

    Returns:
        A list of strings of the form ``"<label> (<n>x, max <max_ok>)"``; empty when
        every check passes.
    """
    replies = [t for r, t in transcript if r == "A"]
    fails = []
    for label, rx, max_ok in sc["checks"]:
        n = sum(1 for t in replies if rx.search(t))
        if n > max_ok:
            fails.append(f"{label} ({n}x, max {max_ok})")
    return fails


def summarize_latencies(lats):
    """Return ``(median, maximum)`` of ``lats``, or ``(0.0, 0.0)`` when it is empty.

    The median is the element at index ``len // 2`` of the sorted values, i.e. the
    upper of the two middle values for an even count.
    """
    if not lats:
        return 0.0, 0.0
    ordered = sorted(lats)
    return ordered[len(ordered) // 2], ordered[-1]


async def main():
    """Parse CLI arguments, replay every scenario per model, and print the results.

    For each model this prints a PASS/FAIL line per scenario (plus the transcript on
    failure or with ``-v``), a per-model failure count with median/max latency, and a
    final summary table over all models.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("models", nargs="+")
    ap.add_argument("--state", action="store_true", help="inject CALL STATE checklist per turn")
    ap.add_argument("-v", "--verbose", action="store_true", help="print transcripts")
    args = ap.parse_args()

    async with httpx.AsyncClient(timeout=120) as client:
        results = {}
        for model in args.models:
            print(f"\n{'='*70}\nMODEL: {model}{' + CALL STATE' if args.state else ''}\n{'='*70}")
            total_fails, all_lats = 0, []
            for sc in SCENARIOS:
                transcript, lats = await run_scenario(client, model, sc, args.state)
                fails = score(sc, transcript)
                total_fails += len(fails)
                all_lats += lats
                mark = "PASS" if not fails else "FAIL: " + "; ".join(fails)
                print(f"\n--- {sc['name']} -> {mark}")
                if args.verbose or fails:
                    for r, t in transcript:
                        print(f" {r}: {t}")
            # Guarded summary: no recorded latencies yields 0.0/0.0 instead of IndexError.
            med, mx = summarize_latencies(all_lats)
            results[model] = (total_fails, med, mx)
            print(f"\n{model}: {total_fails} failure(s) | latency med={med:.2f}s max={mx:.2f}s")

        print(f"\n{'='*70}\nSUMMARY{' (+state)' if args.state else ''}")
        for m, (f, med, mx) in results.items():
            print(f" {m:35s} failures={f} lat med={med:.2f}s max={mx:.2f}s")


if __name__ == "__main__":
    asyncio.run(main())