From 550550975f59b1d21185ba1498f194a9e203b0cc Mon Sep 17 00:00:00 2001
From: tocmo0nlord
Date: Sat, 27 Jun 2026 04:37:45 +0000
Subject: [PATCH] Capture symptom reasons; switch startup warmup to lifespan
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Reason extraction missed symptom-style reasons: a caller said "I'm actually
  blind" and the lead logged reason=None (it caught "disintegrated eyes" before
  but not this). Broadened the extractor's reason rule to capture the eye
  problem/symptom as the reason, not just visit types. Verified 3/3 ->
  "vision loss / blindness".
- server.py: move the LLM warmup/pin (keep_alive=-1) from the deprecated
  on_event("startup") to a lifespan handler — silences the FastAPI deprecation
  warning; model still shows ollama ps UNTIL=Forever.

Co-Authored-By: Claude Opus 4.8
---
 extract.py |  5 ++++-
 server.py  | 20 ++++++++++----------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/extract.py b/extract.py
index 307bc4a..6059c96 100644
--- a/extract.py
+++ b/extract.py
@@ -25,7 +25,10 @@ _EXTRACT_INSTRUCTIONS = (
     "best one, false if they said to use a different number, null if it never came up\n"
     ' "alternate_number": string or null — a different callback number the caller gave, digits only\n'
     ' "location": string or null (which office/city)\n'
-    ' "reason": string or null (e.g. eye exam, broken glasses)\n'
+    ' "reason": string or null — WHY they want to be seen: the visit type OR the eye '
+    "problem/symptom they describe. Capture symptoms too (e.g. \"annual eye exam\", \"blurry "
+    'vision", "vision loss / blindness", "eye pain", "broken glasses", "red eye"). If they '
+    "describe any eye or vision problem, that IS the reason.\n"
     ' "insurance": string or null — the insurance plan the caller named, exactly as they said it\n'
     ' "preferred_time": string or null — the day/time in the caller\'s own words\n'
     ' "resolved_date": string or null — the actual calendar date the caller means as YYYY-MM-DD, '
diff --git a/server.py b/server.py
index 7c4efef..76d9a1f 100644
--- a/server.py
+++ b/server.py
@@ -26,7 +26,9 @@ import hmac
 import json
 import os
 import secrets
+from contextlib import asynccontextmanager
 
+import httpx
 from fastapi import FastAPI, Request, WebSocket
 from fastapi.responses import HTMLResponse
 from loguru import logger
@@ -62,16 +64,11 @@ BUSY_MESSAGE = os.environ.get(
     "Please call back in a few minutes. Goodbye.",
 )
 
-app = FastAPI()
-
-
-@app.on_event("startup")
-async def _warm_llm():
-    """Pin the LLM in VRAM (keep_alive=-1) so the first turn of a call isn't a cold model
-    reload. Cold reloads were adding ~3s of dead air to the first reply; latency is otherwise
-    LLM-side (Whisper STT is ~0.1s). Best-effort — a failure here never blocks startup."""
-    import httpx
-
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """On startup, pin the LLM in VRAM (keep_alive=-1) so the first turn of a call isn't a cold
+    model reload. Cold reloads were adding ~3s of dead air to the first reply; latency is
+    otherwise LLM-side (Whisper STT is ~0.1s). Best-effort — a failure never blocks startup."""
     base = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/v1").rstrip("/")
     if base.endswith("/v1"):
         base = base[:-3]
@@ -83,8 +80,11 @@ async def _warm_llm():
         logger.info(f"Warmed + pinned Ollama model {model} (keep_alive=-1)")
     except Exception as e:
         logger.warning(f"LLM warmup failed (first call may be slow): {e!r}")
+    yield
 
 
+app = FastAPI(lifespan=lifespan)
+
 # Live count of active /ws pipelines (the real GPU consumers), guarded by a lock.
 _active_calls = 0
 _active_lock = asyncio.Lock()