Capture symptom reasons; switch startup warmup to lifespan

- Reason extraction missed symptom-style reasons: a caller said "I'm actually blind" and the lead was logged with reason=None (the extractor had previously caught "disintegrated eyes", but not this phrasing). Broadened the extractor's reason rule to capture the eye problem/symptom as the reason, not just visit types. Verified 3/3 -> "vision loss / blindness".
- server.py: move the LLM warmup/pin (keep_alive=-1) from the deprecated on_event("startup") hook to a lifespan handler. This silences the FastAPI deprecation warning; the model still shows UNTIL=Forever in `ollama ps`.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-27 04:37:45 +00:00
parent ba36ae6891
commit 550550975f
2 changed files with 14 additions and 11 deletions
--- a/server.py
+++ b/server.py
@@ -26,7 +26,9 @@ import hmac
 import json
 import os
 import secrets
+from contextlib import asynccontextmanager

+import httpx
 from fastapi import FastAPI, Request, WebSocket
 from fastapi.responses import HTMLResponse
 from loguru import logger
@@ -62,16 +64,11 @@ BUSY_MESSAGE = os.environ.get(
    "Please call back in a few minutes. Goodbye.",
 )

-app = FastAPI()
-
-
-@app.on_event("startup")
-async def _warm_llm():
-    """Pin the LLM in VRAM (keep_alive=-1) so the first turn of a call isn't a cold model
-    reload. Cold reloads were adding ~3s of dead air to the first reply; latency is otherwise
-    LLM-side (Whisper STT is ~0.1s). Best-effort — a failure here never blocks startup."""
-    import httpx
-
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """On startup, pin the LLM in VRAM (keep_alive=-1) so the first turn of a call isn't a cold
+    model reload. Cold reloads were adding ~3s of dead air to the first reply; latency is
+    otherwise LLM-side (Whisper STT is ~0.1s). Best-effort — a failure never blocks startup."""
    base = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/v1").rstrip("/")
    if base.endswith("/v1"):
        base = base[:-3]
@@ -83,8 +80,11 @@ async def _warm_llm():
        logger.info(f"Warmed + pinned Ollama model {model} (keep_alive=-1)")
    except Exception as e:
        logger.warning(f"LLM warmup failed (first call may be slow): {e!r}")
+    yield


+app = FastAPI(lifespan=lifespan)
+
 # Live count of active /ws pipelines (the real GPU consumers), guarded by a lock.
 _active_calls = 0
 _active_lock = asyncio.Lock()