Files
avc-phone-ai/scripts/score_calls.py
tocmo0nlord fbea2247c6 score_calls: break calls out by outcome type
Parse the per-call outcome (appointment / callback / none / skipped / incomplete)
from the new "Post-call <kind> saved" / "no actionable request" / "skipping card"
log lines. Adds a per-call type column, a "By outcome type" tally, and splits the
leads count into appointment + callback — so a mixed test batch is easy to verify.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 14:52:03 +00:00

94 lines
3.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""Score recent calls against the Phase 1 gate from the server log.
Usage: python scripts/score_calls.py [path/to/avc_run.log]
Reads the loguru server log, groups events per call, and reports for each call +
an aggregate: outcome type (appointment / callback / none / skipped), caller turns,
true model latency (LLM-start -> first TTS), AVC-side termination, lead written,
watchdog re-prompts, and errors. Pairs with the stereo WAVs in recordings/.
"""
import re
import sys
from collections import Counter
from datetime import datetime
# Log file to score: the first command-line argument when given, otherwise
# "avc_run.log" in the current directory.
LOG = "avc_run.log" if len(sys.argv) <= 1 else sys.argv[1]
# Timestamp (date, time, fractional seconds) anchored at the start of a line.
TS = re.compile(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)")
# Outcome line: captures the kind word, the parenthesised field, and the
# remaining text after the colon.
SAVED = re.compile(r"Post-call (\w+) saved \(([^)]*)\): (.*)")


def t(line):
    """Return the timestamp that starts *line* as a datetime, or None.

    None is returned when the line does not begin with a timestamp matching
    ``TS``.
    """
    stamp = TS.match(line)
    if stamp is None:
        return None
    return datetime.strptime(stamp.group(1), "%Y-%m-%d %H:%M:%S.%f")
# Patterns applied to every log line, compiled once instead of per iteration.
_CALL_SID = re.compile(r"call=(CA\w+)")
_ERROR_LINE = re.compile(r"Traceback|ERROR|run_tts\(\) takes|Exception")
_BRACKETED = re.compile(r"\[(.*)\]")

# calls: one record per call, in log order; cur: the record being filled.
calls, cur = [], None
# err: number of error-looking lines seen once the first call has started.
err = 0
# The context manager closes the log even if parsing raises. The log is read
# as UTF-8 (loguru's default file encoding) and undecodable bytes are replaced
# so one bad byte cannot abort the whole scorecard.
with open(LOG, encoding="utf-8", errors="replace") as log_file:
    for line in log_file:
        # A "Media stream start" line opens a new call record; every later
        # line is attributed to it until the next start line.
        if "Media stream start" in line:
            sid_match = _CALL_SID.search(line)
            cur = {
                "sid": sid_match.group(1) if sid_match else "?",
                "stt": [],           # bracketed text of each transcription line
                "llm_start": None,   # timestamp of the pending LLM request
                "lat": [],           # LLM-start -> TTS delays, in seconds
                "closing": False,    # a "signalled closing" line was seen
                "hangup": False,     # an "ending task / hanging up" line was seen
                "type": None,        # outcome kind; None until an outcome line
                "lead": None,        # formatted text of the saved outcome
                "reprompts": 0,      # number of "re-prompt #" lines
            }
            calls.append(cur)
        if cur is None:
            # Lines before the first call start are ignored entirely.
            continue
        if _ERROR_LINE.search(line):
            err += 1
        if "run_stt:380 - Transcription:" in line:
            bracketed = _BRACKETED.search(line)
            # A transcription line without [...] is skipped instead of
            # crashing on a None match.
            if bracketed:
                cur["stt"].append(bracketed.group(1).strip())
        elif "get_chat_completions:296" in line:
            cur["llm_start"] = t(line)
        elif ("run_tts:213" in line and "Generating TTS" in line
              and cur["llm_start"]):
            tts_at = t(line)
            # A TTS line with no parseable timestamp cannot be timed.
            if tts_at is not None:
                delay = (tts_at - cur["llm_start"]).total_seconds()
                # Discard negative or >= 60 s gaps as implausible pairings.
                if 0 <= delay < 60:
                    cur["lat"].append(delay)
            # Only the first TTS after an LLM start is measured.
            cur["llm_start"] = None
        elif "signalled closing" in line:
            cur["closing"] = True
        elif "ending task / hanging up" in line:
            cur["hangup"] = True
        elif "re-prompt #" in line:
            cur["reprompts"] += 1
        elif "no actionable request" in line:
            cur["type"] = "none"
        elif "skipping card" in line:
            cur["type"] = "skipped"
        else:
            saved = SAVED.search(line)
            if saved:
                cur["type"] = saved.group(1)  # appointment | callback
                cur["lead"] = f"[{saved.group(2)}] {saved.group(3)}"
# Per-call scorecard: one summary line per call, plus the saved lead text when
# there is one. all_lat collects every call's latency samples for the
# aggregate section that follows.
all_lat = []
print(f"\n=== Call scorecard ({LOG}) — {len(calls)} call(s) ===\n")
for number, call in enumerate(calls, 1):
    samples = sorted(call["lat"])
    all_lat.extend(call["lat"])
    if samples:
        # Upper median, and the sample at index floor(0.95 * n) clamped to
        # the last element.
        median = samples[len(samples) // 2]
        p95 = samples[min(len(samples) - 1, int(0.95 * len(samples)))]
    else:
        median = p95 = 0
    # Only non-empty transcriptions count as caller turns.
    turns = sum(1 for text in call["stt"] if text)
    kind = call["type"] or ""
    hangup_flag = "Y" if call["hangup"] else "-"
    lead_flag = "Y" if call["lead"] else "no"
    print(f"Call {number} {call['sid']}: type={kind:11s} turns={turns} | "
          f"lat med={median:.2f}s p95={p95:.2f}s | "
          f"hangup={hangup_flag} reprompts={call['reprompts']} | "
          f"lead={lead_flag}")
    if call["lead"]:
        print(f" {call['lead']}")
# Aggregate report over every parsed call: outcome tally, then the gate lines.
al = sorted(all_lat)
# P95 is the sample at index floor(0.95 * n), clamped to the last element;
# it is 0 when there are no samples at all.
p95 = al[min(len(al) - 1, int(0.95 * len(al)))] if al else 0
# Calls that never logged an outcome line are tallied as "incomplete".
types = Counter(c["type"] or "incomplete" for c in calls)
print("\n=== By outcome type ===")
known_kinds = ("appointment", "callback", "none", "skipped", "incomplete")
# Known kinds print first in a fixed order; any other kind captured from a
# "Post-call <kind> saved" line follows, so the tally accounts for every call.
other_kinds = tuple(sorted(set(types) - set(known_kinds)))
for k in known_kinds + other_kinds:
    if types.get(k):
        print(f" {k:11s}: {types[k]}")
# With no latency samples p95 is a placeholder 0, so it must not be reported
# as a pass.
if not al:
    latency_verdict = "CHECK, no latency samples"
elif p95 < 3:
    latency_verdict = "PASS <3s"
else:
    latency_verdict = "CHECK"
print("\n=== Phase 1 gate ===")
print(f" calls placed : {len(calls)} (gate wants 10 consecutive clean)")
print(f" errors / tracebacks : {err} ({'PASS' if err == 0 else 'FAIL'})")
print(f" AVC-side hangups : {sum(c['hangup'] for c in calls)}/{len(calls)}")
print(f" leads written : {sum(1 for c in calls if c['lead'])}/{len(calls)} "
      f"(appointment {types.get('appointment', 0)} + callback {types.get('callback', 0)})")
print(f" watchdog re-prompts (silence): {sum(c['reprompts'] for c in calls)}")
print(f" latency P95 (LLM->TTS) : {p95:.2f}s ({latency_verdict})")
print("\n recordings/ has the stereo WAVs (caller=L, agent=R) for audio review.")