From fbea2247c6d738ec9816933d2f804a2ac3a81b35 Mon Sep 17 00:00:00 2001 From: tocmo0nlord Date: Mon, 29 Jun 2026 14:52:03 +0000 Subject: [PATCH] score_calls: break calls out by outcome type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parse the per-call outcome (appointment / callback / none / skipped / incomplete) from the new "Post-call saved" / "no actionable request" / "skipping card" log lines. Adds a per-call type column, a "By outcome type" tally, and splits the leads count into appointment + callback — so a mixed test batch is easy to verify. Co-Authored-By: Claude Opus 4.8 --- scripts/score_calls.py | 45 ++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/scripts/score_calls.py b/scripts/score_calls.py index 68a14ee..63b3d7d 100755 --- a/scripts/score_calls.py +++ b/scripts/score_calls.py @@ -4,16 +4,18 @@ Usage: python scripts/score_calls.py [path/to/avc_run.log] Reads the loguru server log, groups events per call, and reports for each call + -an aggregate: caller turns, silent non-responses, true model latency (LLM-start -> -first TTS), AVC-side termination, post-call lead written, watchdog re-prompts, and -errors. Pairs with the stereo WAVs in recordings/ for audio review. +an aggregate: outcome type (appointment / callback / none / skipped), caller turns, +true model latency (LLM-start -> first TTS), AVC-side termination, lead written, +watchdog re-prompts, and errors. Pairs with the stereo WAVs in recordings/. """ import re import sys +from collections import Counter from datetime import datetime LOG = sys.argv[1] if len(sys.argv) > 1 else "avc_run.log" TS = re.compile(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)") +SAVED = re.compile(r"Post-call (\w+) saved \(([^)]*)\): (.*)") def t(line): @@ -26,15 +28,15 @@ err = 0 for line in open(LOG): if "Media stream start" in line: cur = {"sid": (re.search(r"call=(CA\w+)", line) or [None, "?"])[1], - "t0": t(line), "stt": [], "llm_start": None, "lat": [], - "closing": False, "hangup": False, "lead": None, "reprompts": 0} + "stt": [], "llm_start": None, "lat": [], "closing": False, + "hangup": False, "type": None, "lead": None, "reprompts": 0} calls.append(cur) if cur is None: continue if re.search(r"Traceback|ERROR|run_tts\(\) takes|Exception", line): err += 1 if "run_stt:380 - Transcription:" in line: - cur["stt"].append((t(line), re.search(r"\[(.*)\]", line).group(1).strip())) + cur["stt"].append(re.search(r"\[(.*)\]", line).group(1).strip()) elif "get_chat_completions:296" in line: cur["llm_start"] = t(line) elif "run_tts:213" in line and "Generating TTS" in line and cur["llm_start"]: @@ -48,31 +50,44 @@ for line in open(LOG): cur["hangup"] = True elif "re-prompt #" in line: cur["reprompts"] += 1 - elif "Post-call appointment saved" in line: - cur["lead"] = line.split(" - ")[-1].strip() + elif "no actionable request" in line: + cur["type"] = "none" + elif "skipping card" in line: + cur["type"] = "skipped" + else: + m = SAVED.search(line) + if m: + cur["type"] = m.group(1) # appointment | callback + cur["lead"] = f"[{m.group(2)}] {m.group(3)}" all_lat = [] -nonresp_total = 0 print(f"\n=== Call scorecard ({LOG}) — {len(calls)} call(s) ===\n") for i, c in enumerate(calls, 1): - turns = len([s for s in c["stt"] if s[1]]) + turns = len([s for s in c["stt"] if s]) lat = sorted(c["lat"]) p95 = lat[min(len(lat) - 1, int(0.95 * len(lat)))] if lat else 0 all_lat += c["lat"] - print(f"Call {i} {c['sid']}: turns={turns} | " - f"latency med={ (lat[len(lat)//2] if lat else 0):.2f}s p95={p95:.2f}s | " - f"closing={'Y' if c['closing'] else '-'} hangup={'Y' if c['hangup'] else '-'} | " - f"reprompts={c['reprompts']} | lead={'Y' if c['lead'] else 'no'}") + print(f"Call {i} {c['sid']}: type={c['type'] or '—':11s} turns={turns} | " + f"lat med={(lat[len(lat)//2] if lat else 0):.2f}s p95={p95:.2f}s | " + f"hangup={'Y' if c['hangup'] else '-'} reprompts={c['reprompts']} | " + f"lead={'Y' if c['lead'] else 'no'}") if c["lead"]: print(f" {c['lead']}") al = sorted(all_lat) p95 = al[min(len(al) - 1, int(0.95 * len(al)))] if al else 0 +types = Counter(c["type"] or "incomplete" for c in calls) +print("\n=== By outcome type ===") +for k in ("appointment", "callback", "none", "skipped", "incomplete"): + if types.get(k): + print(f" {k:11s}: {types[k]}") + print("\n=== Phase 1 gate ===") print(f" calls placed : {len(calls)} (gate wants 10 consecutive clean)") print(f" errors / tracebacks : {err} ({'PASS' if err == 0 else 'FAIL'})") print(f" AVC-side hangups : {sum(c['hangup'] for c in calls)}/{len(calls)}") -print(f" leads written : {sum(1 for c in calls if c['lead'])}/{len(calls)}") +print(f" leads written : {sum(1 for c in calls if c['lead'])}/{len(calls)} " + f"(appointment {types.get('appointment', 0)} + callback {types.get('callback', 0)})") print(f" watchdog re-prompts (silence): {sum(c['reprompts'] for c in calls)}") print(f" latency P95 (LLM->TTS) : {p95:.2f}s ({'PASS <3s' if p95 < 3 else 'CHECK'})") print(f"\n recordings/ has the stereo WAVs (caller=L, agent=R) for audio review.")