From ba36ae6891ce95795e30189c639f68bc40d5d7ee Mon Sep 17 00:00:00 2001
From: tocmo0nlord <mr.garcia09@gmail.com>
Date: Sat, 27 Jun 2026 04:24:10 +0000
Subject: [PATCH] Log/surface the reason, pin LLM warm for latency, doc
 insurance rule
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Reason visibility: the reason WAS extracted ("disintegrated eyes") but only
  lived in the Odoo description note. Add it to the post-call log line and to
  the Odoo lead title so it's visible at a glance.
- Latency: split the timing — Whisper is ~0.1s, latency is LLM-side. The ~3s
  tail was cold model reloads after Ollama's keep-alive expired. server.py now
  warms + pins the model on startup (keep_alive=-1, ollama ps UNTIL=Forever),
  removing cold first-turn stalls. Whisper size left alone (not the bottleneck).
- CLAUDE.md: insurance rule (never suggest/guess the plan), latency note.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CLAUDE.md      | 11 ++++++++++-
 extract.py     |  3 ++-
 odoo_client.py |  2 +-
 server.py      | 21 +++++++++++++++++++++
 4 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index c44f067..16ff55f 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -277,7 +277,7 @@ Replies are kept to one short sentence.
 | Phone | Confirmed **near the end** (not led with); reads back the caller-ID — injected pre-spelled so it's said digit-by-digit — and if the caller declines, uses the number they give | `callback_number` (+ `phone_confirmed`) |
 | Office / city | Asks city/area; when the caller names a place that matches an office, **confirms that office and moves on** — never offers/compares other offices or asks them to choose; names the nearest only if nothing matches | folded into `reason` prefix |
 | Reason | Captured from the conversation | `reason` |
-| Insurance | **Log only** — asks the plan, never promises/confirms/denies coverage or treatment (even a listed plan); staff verify on callback | `insurance` (note: "log only — staff to verify") |
+| Insurance | **Log only, never suggest or guess** — asks open-endedly (no plan names read out), captures only what the caller says, never fills in/completes/guesses the plan (asks them to repeat if unclear), never promises/confirms/denies coverage or treatment even for a listed plan; staff verify on callback | `insurance` (note: "log only — staff to verify") |
 | Preferred day & time | **Capture & defer** — taken in the caller's own words; AVA does not compute or correct the date | `preferred_time` + best-effort resolved `YYYY-MM-DD` |
 
 ### Dates — capture & defer (do NOT compute in-call)
@@ -343,6 +343,15 @@ stalls = dead air. The capture changes made it worse by briefly injecting a 45-d
 headroom (RTX 5080 has the VRAM). Rebuild keeps the previous model as `activeblue-avc:pre-ctx8k`
 for rollback. Keep the live system prompt lean for the same reason.
 
+### Latency note — model is pinned warm
+
+Per-turn latency is **LLM-side**, not STT: Whisper runs ~0.1s (VAD-stop → transcript), while
+transcript → first TTS is ~0.26s median. The tail (P95 ~3s) came from **cold model reloads** —
+Ollama unloads after its keep-alive window, so the first reply of a call after an idle gap paid
+a ~3s load. Fix: `server.py` fires a startup warmup that pins the model with `keep_alive=-1`
+(`ollama ps` shows UNTIL = Forever). Residual ~3s spikes on some later turns are 8B generation
+variance. Switching Whisper size would NOT help — it's not the bottleneck.
+
 ### Why Q4_K_M not Q8_0
 
 Q8_0 consumed ~8.5GB VRAM for weights alone. Under telephony load this caused
diff --git a/extract.py b/extract.py
index a93fca7..307bc4a 100644
--- a/extract.py
+++ b/extract.py
@@ -123,6 +123,7 @@ async def extract_and_record(messages, ollama_url, model, call_sid=None, caller_
     where = persist_appointment(record)
     logger.info(
         f"Post-call appointment saved ({where}): {record['patient_name']} / "
-        f"{record['location']} / ins={record['insurance']} / when={record['preferred_time']}"
+        f"{record['location']} / reason={record['reason']} / ins={record['insurance']} / "
+        f"when={record['preferred_time']}"
     )
     return record
diff --git a/odoo_client.py b/odoo_client.py
index b248bab..44ec2a9 100644
--- a/odoo_client.py
+++ b/odoo_client.py
@@ -62,7 +62,7 @@ def create_appointment_request(patient_name, callback_number, reason, preferred_
                                insurance=None, call_sid=None):
     """Create the request in Odoo. Returns (model, record_id) or raises OdooError."""
     uid, models = _connect()
-    summary = f"📞 Phone appt request — {patient_name or 'caller'}"
+    summary = f"📞 Phone appt — {patient_name or 'caller'}" + (f": {reason}" if reason else "")
     # description is an Odoo HTML field — build with <br/> so it renders in the UI.
     rows = [
         ("Name", patient_name),
diff --git a/server.py b/server.py
index 407706b..7c4efef 100644
--- a/server.py
+++ b/server.py
@@ -64,6 +64,27 @@ BUSY_MESSAGE = os.environ.get(
 
 app = FastAPI()
 
+
+@app.on_event("startup")
+async def _warm_llm():
+    """Pin the LLM in VRAM (keep_alive=-1) so the first turn of a call isn't a cold model
+    reload (~3s of dead air; latency is otherwise LLM-side, Whisper STT is ~0.1s). Best-effort:
+    any failure (incl. a non-2xx reply) is logged, never aborts startup; startup does await it."""
+    import httpx
+
+    base = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/v1").rstrip("/")
+    if base.endswith("/v1"):  # drop the OpenAI-compat suffix; /api/generate is the native API
+        base = base[:-3]
+    model = os.environ.get("OLLAMA_MODEL", "activeblue-avc:latest")
+    try:
+        async with httpx.AsyncClient(timeout=120) as c:
+            payload = {"model": model, "prompt": "ok", "stream": False, "keep_alive": -1}
+            (await c.post(f"{base}/api/generate", json=payload)).raise_for_status()
+        logger.info(f"Warmed + pinned Ollama model {model} (keep_alive=-1)")
+    except Exception as e:
+        logger.warning(f"LLM warmup failed (first call may be slow): {e!r}")
+
+
 # Live count of active /ws pipelines (the real GPU consumers), guarded by a lock.
 _active_calls = 0
 _active_lock = asyncio.Lock()