score_calls: break calls out by outcome type

Parse the per-call outcome (appointment / callback / none / skipped) from the new "Post-call <kind> saved" / "no actionable request" / "skipping card" log lines; a call with none of these lines is tallied as "incomplete". Add a per-call type column and a "By outcome type" tally, and split the leads count into appointment + callback, so a mixed test batch is easy to verify. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 14:52:03 +00:00
parent 97e109ed89
commit fbea2247c6
1 changed file with 30 additions and 15 deletions
--- a/scripts/score_calls.py
+++ b/scripts/score_calls.py
@@ -4,16 +4,18 @@
 Usage: python scripts/score_calls.py [path/to/avc_run.log]

 Reads the loguru server log, groups events per call, and reports for each call +
-an aggregate: caller turns, silent non-responses, true model latency (LLM-start ->
-first TTS), AVC-side termination, post-call lead written, watchdog re-prompts, and
-errors. Pairs with the stereo WAVs in recordings/ for audio review.
+an aggregate: outcome type (appointment / callback / none / skipped), caller turns,
+true model latency (LLM-start -> first TTS), AVC-side termination, lead written,
+watchdog re-prompts, and errors. Pairs with the stereo WAVs in recordings/.
 """
 import re
 import sys
+from collections import Counter
 from datetime import datetime

 LOG = sys.argv[1] if len(sys.argv) > 1 else "avc_run.log"
 TS = re.compile(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)")
+SAVED = re.compile(r"Post-call (\w+) saved \(([^)]*)\): (.*)")


 def t(line):
@@ -26,15 +28,15 @@ err = 0
 for line in open(LOG):
    if "Media stream start" in line:
        cur = {"sid": (re.search(r"call=(CA\w+)", line) or [None, "?"])[1],
-               "t0": t(line), "stt": [], "llm_start": None, "lat": [],
-               "closing": False, "hangup": False, "lead": None, "reprompts": 0}
+               "stt": [], "llm_start": None, "lat": [], "closing": False,
+               "hangup": False, "type": None, "lead": None, "reprompts": 0}
        calls.append(cur)
    if cur is None:
        continue
    if re.search(r"Traceback|ERROR|run_tts\(\) takes|Exception", line):
        err += 1
    if "run_stt:380 - Transcription:" in line:
-        cur["stt"].append((t(line), re.search(r"\[(.*)\]", line).group(1).strip()))
+        cur["stt"].append(re.search(r"\[(.*)\]", line).group(1).strip())
    elif "get_chat_completions:296" in line:
        cur["llm_start"] = t(line)
    elif "run_tts:213" in line and "Generating TTS" in line and cur["llm_start"]:
@@ -48,31 +50,44 @@ for line in open(LOG):
        cur["hangup"] = True
    elif "re-prompt #" in line:
        cur["reprompts"] += 1
-    elif "Post-call appointment saved" in line:
-        cur["lead"] = line.split(" - ")[-1].strip()
+    elif "no actionable request" in line:
+        cur["type"] = "none"
+    elif "skipping card" in line:
+        cur["type"] = "skipped"
+    else:
+        m = SAVED.search(line)
+        if m:
+            cur["type"] = m.group(1)              # appointment | callback
+            cur["lead"] = f"[{m.group(2)}] {m.group(3)}"

 all_lat = []
-nonresp_total = 0
 print(f"\n=== Call scorecard ({LOG}) — {len(calls)} call(s) ===\n")
 for i, c in enumerate(calls, 1):
-    turns = len([s for s in c["stt"] if s[1]])
+    turns = len([s for s in c["stt"] if s])
    lat = sorted(c["lat"])
    p95 = lat[min(len(lat) - 1, int(0.95 * len(lat)))] if lat else 0
    all_lat += c["lat"]
-    print(f"Call {i} {c['sid']}: turns={turns} | "
-          f"latency med={ (lat[len(lat)//2] if lat else 0):.2f}s p95={p95:.2f}s | "
-          f"closing={'Y' if c['closing'] else '-'} hangup={'Y' if c['hangup'] else '-'} | "
-          f"reprompts={c['reprompts']} | lead={'Y' if c['lead'] else 'no'}")
+    print(f"Call {i} {c['sid']}: type={c['type'] or '—':11s} turns={turns} | "
+          f"lat med={(lat[len(lat)//2] if lat else 0):.2f}s p95={p95:.2f}s | "
+          f"hangup={'Y' if c['hangup'] else '-'} reprompts={c['reprompts']} | "
+          f"lead={'Y' if c['lead'] else 'no'}")
    if c["lead"]:
        print(f"         {c['lead']}")

 al = sorted(all_lat)
 p95 = al[min(len(al) - 1, int(0.95 * len(al)))] if al else 0
+types = Counter(c["type"] or "incomplete" for c in calls)
+print("\n=== By outcome type ===")
+for k in ("appointment", "callback", "none", "skipped", "incomplete"):
+    if types.get(k):
+        print(f"  {k:11s}: {types[k]}")
+
 print("\n=== Phase 1 gate ===")
 print(f"  calls placed                 : {len(calls)}  (gate wants 10 consecutive clean)")
 print(f"  errors / tracebacks          : {err}  ({'PASS' if err == 0 else 'FAIL'})")
 print(f"  AVC-side hangups             : {sum(c['hangup'] for c in calls)}/{len(calls)}")
-print(f"  leads written                : {sum(1 for c in calls if c['lead'])}/{len(calls)}")
+print(f"  leads written                : {sum(1 for c in calls if c['lead'])}/{len(calls)} "
+      f"(appointment {types.get('appointment', 0)} + callback {types.get('callback', 0)})")
 print(f"  watchdog re-prompts (silence): {sum(c['reprompts'] for c in calls)}")
 print(f"  latency P95 (LLM->TTS)       : {p95:.2f}s  ({'PASS <3s' if p95 < 3 else 'CHECK'})")
 print(f"\n  recordings/ has the stereo WAVs (caller=L, agent=R) for audio review.")