Capture symptom reasons; switch startup warmup to lifespan

- Reason extraction missed symptom-style reasons: a caller said "I'm actually
  blind" and the lead logged reason=None (it caught "disintegrated eyes" before
  but not this). Broadened the extractor's reason rule to capture the eye
  problem/symptom as the reason, not just visit types. Verified 3/3 -> "vision
  loss / blindness".
- server.py: move the LLM warmup/pin (keep_alive=-1) from the deprecated
  @app.on_event("startup") hook to a lifespan handler. This silences the FastAPI
  deprecation warning; the model still shows UNTIL=Forever in `ollama ps`.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
tocmo0nlord
2026-06-27 04:37:45 +00:00
parent ba36ae6891
commit 550550975f
2 changed files with 14 additions and 11 deletions

View File

@@ -26,7 +26,9 @@ import hmac
import json
import os
import secrets
from contextlib import asynccontextmanager
import httpx
from fastapi import FastAPI, Request, WebSocket
from fastapi.responses import HTMLResponse
from loguru import logger
@@ -62,16 +64,11 @@ BUSY_MESSAGE = os.environ.get(
"Please call back in a few minutes. Goodbye.",
)
app = FastAPI()
@app.on_event("startup")
async def _warm_llm():
"""Pin the LLM in VRAM (keep_alive=-1) so the first turn of a call isn't a cold model
reload. Cold reloads were adding ~3s of dead air to the first reply; latency is otherwise
LLM-side (Whisper STT is ~0.1s). Best-effort — a failure here never blocks startup."""
import httpx
@asynccontextmanager
async def lifespan(app: FastAPI):
"""On startup, pin the LLM in VRAM (keep_alive=-1) so the first turn of a call isn't a cold
model reload. Cold reloads were adding ~3s of dead air to the first reply; latency is
otherwise LLM-side (Whisper STT is ~0.1s). Best-effort — a failure never blocks startup."""
base = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/v1").rstrip("/")
if base.endswith("/v1"):
base = base[:-3]
@@ -83,8 +80,11 @@ async def _warm_llm():
logger.info(f"Warmed + pinned Ollama model {model} (keep_alive=-1)")
except Exception as e:
logger.warning(f"LLM warmup failed (first call may be slow): {e!r}")
yield
app = FastAPI(lifespan=lifespan)
# Live count of active /ws pipelines (the real GPU consumers), guarded by a lock.
_active_calls = 0
_active_lock = asyncio.Lock()