Fix re-asking: deterministic slot memory + user-turn merge + reason-loop prompt
Historical calls showed the 8B model re-asking for the name/reason/phone it already had
("I already gave you my full name"; the "I want an appointment" -> "what brings
you in?" loop), and VAD splitting one utterance into consecutive user turns.
- callstate.py: CallStateGroomer between agg.user() and the LLM. After each
agent turn (off the critical path) it extracts collected slots via one short
JSON-mode Ollama pass, then before each generation injects an ALREADY
COLLECTED / STILL NEEDED checklist into the system message and merges
VAD-fragmented consecutive user messages. Callback-type calls get an explicit
"no booking questions" line. CALL_STATE_TRACKING env (auto: on for ollama,
off for anthropic).
- bot.py prompt step 1: "I want an appointment" is the booking intent, not the
reason - ask the visit reason once, never twice.
- scripts/ab_replay.py: regression harness replaying the real failed calls.
Results: llama3.1-8b raw: 3 failures; with CALL STATE: 0 failures across all
scenarios (median chat latency 0.31s -> 0.55s, well under the 3s gate).
Qwen3-14B was A/B-tested and rejected: no better when run raw, ~3s/turn, 11GB VRAM.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
165
scripts/ab_replay.py
Normal file
165
scripts/ab_replay.py
Normal file
@@ -0,0 +1,165 @@
|
||||
#!/usr/bin/env python3
|
||||
"""A/B replay: re-run the historical problem scenarios against candidate LLMs.
|
||||
|
||||
Replays scripted caller turns (taken from real failed calls in the run logs) through the
|
||||
production system prompt and checks each model for the observed failure modes: re-asking
|
||||
the reason ("I want an appointment" loop), re-asking name/phone, and forcing booking
|
||||
questions (insurance/day-time) on non-booking callers. Also reports per-turn latency.
|
||||
|
||||
Usage (inside the pipecat venv):
|
||||
python scripts/ab_replay.py activeblue-avc:latest qwen3:14b
|
||||
python scripts/ab_replay.py --state activeblue-avc:latest # with CALL STATE injection
|
||||
|
||||
--state simulates the CallStateGroomer: between turns it runs the callstate extraction
|
||||
and injects the ALREADY COLLECTED / STILL NEEDED checklist, exactly as in-call.
|
||||
"""
|
||||
import argparse
|
||||
import asyncio
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
from bot import SYSTEM_PROMPT # noqa: E402 (import parses env + practice facts only)
|
||||
from callstate import build_state_block, extract_call_state # noqa: E402
|
||||
|
||||
# Base URL of the local Ollama HTTP API; chat() posts to f"{OLLAMA}/api/chat" and
# run_scenario() hands the same URL to extract_call_state().
OLLAMA = "http://127.0.0.1:11434"

# Caller-ID instructions appended to SYSTEM_PROMPT for every replayed scenario.
# The text is split at sentence boundaries; the concatenated value is unchanged.
CALLER_LINE = (
    "\n\nCALLER ID: the caller's number on file, written so you read it digit by digit, is: "
    "nine seven three, five seven three, one six seven one. "
    "Near the end, state it back and invite a correction only "
    "('...; if that's not the best number, just let me know.') — "
    "do NOT ask a yes/no question or wait for a 'yes'. "
    "Only change it if they give a different number. "
    "Do not say it any earlier in the call."
)

# Scripted opening assistant turn that seeds every replayed conversation.
GREETING = (
    "Thank you for calling Advanced Vision Care, this is AVA. "
    "How can I help you today?"
)
|
||||
|
||||
# Failure-mode detectors. Each scenario check is a tuple of
# (label, regex counted across assistant turns, max allowed count).

# The assistant asking for the visit reason.
ASK_REASON = re.compile(
    "|".join(
        (
            r"what brings you",
            r"reason for",
            r"reason you",
            r"what would you like to be seen",
            r"what.s the visit for",
            r"what seems to be",
        )
    ),
    re.IGNORECASE,
)

# The assistant mentioning "full name" / "your name" / "the name".
ASK_NAME = re.compile(r"(full |your |the )name", re.IGNORECASE)

# Any mention of insurance.
ASK_INSURANCE = re.compile(r"insurance", re.IGNORECASE)

# Asking FOR a number is the failure; the statement-form readback ("I have your number
# as ...; if that's not the best number, just let me know") is correct behavior.
ASK_PHONE_Q = re.compile(
    r"(what('| i)s|can I (get|have)|may I (get|have)|could I (get|have)|give me)"
    r".{0,40}(phone|number)",
    re.IGNORECASE,
)

# Asking which/what city, area, office or location.
ASK_LOCATION = re.compile(r"(which|what).{0,30}(city|area|office|location)", re.IGNORECASE)
|
||||
|
||||
# Scenarios distilled from real failed calls (log refs are in each scenario name).
# A tuple inside "turns" is one VAD-fragmented utterance: its items are sent as
# separate user messages that receive a single reply (from log.10 call#1).
# Each "checks" entry is (label, regex, max number of assistant turns allowed to match).
SCENARIOS = [
    {
        "name": "reason-loop (avc_run.log call#1 / log.21 call#5)",
        "turns": [
            "I want an appointment.",
            "appointment",
            "Kendall",
            "Carlos Garcia",
            "Humana",
            ("Monday", "3 p.m."),
            "No, that's all, thank you.",
        ],
        "checks": [
            ("re-asked reason", ASK_REASON, 1),
            ("re-asked name", ASK_NAME, 1),
            ("re-asked location", ASK_LOCATION, 1),
            ("asked for phone (has caller-ID)", ASK_PHONE_Q, 0),
        ],
    },
    {
        "name": "glasses callback (log.23/24/25)",
        "turns": [
            "Hey, I'm a patient in Kendall and I need to know when my glasses are ready.",
            "Carlos Garcia",
            "That's what I'm asking — the status of my order.",
            "Yes, that's a good number.",
            "No, that's all.",
        ],
        "checks": [
            ("asked insurance on non-booking call", ASK_INSURANCE, 0),
            (
                "asked day/time on non-booking call",
                re.compile(r"(what|which) day|day and time|preferred (day|time)", re.I),
                0,
            ),
            ("re-asked name", ASK_NAME, 1),
        ],
    },
    {
        "name": "early-info booking (log.4 call#1: reason+city up front)",
        "turns": [
            "I'm having eye pain and I'm in Kendall, Florida.",
            "Yes please.",
            "Carlos Garcia",
            "Florida Blue Medicare",
            ("Monday", "5 p.m."),
            "No, that's everything.",
        ],
        "checks": [
            ("re-asked reason", ASK_REASON, 1),
            ("re-asked location", ASK_LOCATION, 0),  # was given in turn 1
            ("re-asked name", ASK_NAME, 1),
            ("asked for phone (has caller-ID)", ASK_PHONE_Q, 0),
        ],
    },
]
|
||||
|
||||
|
||||
async def chat(client, model, messages, think_capable):
    """Send one non-streaming chat request to Ollama and time it.

    Args:
        client: HTTP client exposing an awaitable ``post(url, json=...)``;
            ``main()`` passes an ``httpx.AsyncClient``.
        model: Ollama model tag to query.
        messages: Chat history as a list of ``{"role": ..., "content": ...}`` dicts.
        think_capable: When true, ``"think": False`` is added to the request body
            (presumably to suppress reasoning output on thinking models — confirm
            against the Ollama API docs).

    Returns:
        A ``(reply_text, elapsed_seconds)`` tuple; the reply text is stripped of
        surrounding whitespace.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error response.
    """
    body = {
        "model": model, "stream": False, "messages": messages,
        "options": {"temperature": 0.3, "num_predict": 160, "num_ctx": 8192},
    }
    if think_capable:
        body["think"] = False
    # perf_counter() is monotonic, so the measured latency cannot be skewed (or go
    # negative) when the wall clock is adjusted mid-request, unlike time.time().
    t0 = time.perf_counter()
    r = await client.post(f"{OLLAMA}/api/chat", json=body)
    r.raise_for_status()
    return r.json()["message"]["content"].strip(), time.perf_counter() - t0
|
||||
|
||||
|
||||
async def run_scenario(client, model, sc, with_state):
    """Replay one scripted scenario against ``model``.

    Args:
        client: HTTP client forwarded to ``chat()``.
        model: Ollama model tag.
        sc: Scenario dict; only ``sc["turns"]`` is read here. A tuple entry is sent
            as several consecutive user messages followed by a single reply.
        with_state: When true, the call state is re-extracted before every
            generation and the resulting block is appended to the system message.

    Returns:
        ``(transcript, latencies)``: ``transcript`` is a list of ``(role, text)``
        pairs with role ``"A"`` (assistant) or ``"C"`` (caller); ``latencies`` holds
        the per-reply seconds reported by ``chat()``.
    """
    system_text = SYSTEM_PROMPT + CALLER_LINE
    history = [
        {"role": "system", "content": system_text},
        {"role": "assistant", "content": GREETING},
    ]
    transcript = [("A", GREETING)]
    latencies = []
    can_think = any(tag in model for tag in ("qwen3", "deepseek-r1"))

    for turn in sc["turns"]:
        # Normalise to a tuple so single and fragmented utterances share one path.
        if isinstance(turn, tuple):
            fragments = turn
        else:
            fragments = (turn,)
        for fragment in fragments:
            history.append({"role": "user", "content": fragment})
            transcript.append(("C", fragment))

        if with_state:
            # Best effort: a failed extraction is reported and the turn still runs
            # with whatever system message is currently in place.
            try:
                state = await extract_call_state(history, OLLAMA, model)
                block = build_state_block(state)
                suffix = "\n\n" + block if block else ""
                history[0]["content"] = system_text + suffix
            except Exception as e:
                print(f" (state extraction failed: {e})")

        reply, elapsed = await chat(client, model, history, can_think)
        latencies.append(elapsed)
        history.append({"role": "assistant", "content": reply})
        transcript.append(("A", reply))

        # Stop replaying once the assistant says goodbye.
        if "goodbye" in reply.lower():
            break

    return transcript, latencies
|
||||
|
||||
|
||||
def score(sc, transcript):
    """Check a transcript against a scenario's failure-mode limits.

    Args:
        sc: Scenario dict; ``sc["checks"]`` is an iterable of
            ``(label, compiled_regex, max_allowed)`` tuples.
        transcript: List of ``(role, text)`` pairs; only role ``"A"`` entries
            are examined.

    Returns:
        A list of failure descriptions, one per check whose number of matching
        assistant turns exceeds its limit (empty when every check passes).
    """
    agent_lines = [text for role, text in transcript if role == "A"]
    failures = []
    for label, pattern, limit in sc["checks"]:
        # Count turns that match at least once, not the total number of matches.
        hits = len([line for line in agent_lines if pattern.search(line)])
        if hits > limit:
            failures.append(f"{label} ({hits}x, max {limit})")
    return failures
|
||||
|
||||
|
||||
async def main():
    """Replay every scenario against each model named on the command line.

    Prints a PASS/FAIL line per scenario (with the transcript on failure or when
    ``-v`` is given), a per-model failure count with median/max chat latency, and
    a final summary table across all models.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("models", nargs="+")
    parser.add_argument("--state", action="store_true", help="inject CALL STATE checklist per turn")
    parser.add_argument("-v", "--verbose", action="store_true", help="print transcripts")
    args = parser.parse_args()

    rule = "=" * 70
    async with httpx.AsyncClient(timeout=120) as client:
        summary = {}
        for model in args.models:
            mode = " + CALL STATE" if args.state else ""
            print(f"\n{rule}\nMODEL: {model}{mode}\n{rule}")

            failure_count = 0
            latencies = []
            for sc in SCENARIOS:
                transcript, lats = await run_scenario(client, model, sc, args.state)
                fails = score(sc, transcript)
                failure_count += len(fails)
                latencies.extend(lats)

                if fails:
                    mark = "FAIL: " + "; ".join(fails)
                else:
                    mark = "PASS"
                print(f"\n--- {sc['name']} -> {mark}")

                if args.verbose or fails:
                    for role, text in transcript:
                        print(f" {role}: {text}")

            # Middle element of the sorted latencies (upper middle for even counts).
            latencies.sort()
            median = latencies[len(latencies) // 2]
            worst = latencies[-1]
            summary[model] = (failure_count, median, worst)
            print(f"\n{model}: {failure_count} failure(s) | latency med={median:.2f}s max={worst:.2f}s")

        tag = " (+state)" if args.state else ""
        print(f"\n{rule}\nSUMMARY{tag}")
        for m, (f, med, mx) in summary.items():
            print(f" {m:35s} failures={f} lat med={med:.2f}s max={mx:.2f}s")
|
||||
|
||||
|
||||
# Script entry point: run the async replay driver only when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user