Capture symptom reasons; switch startup warmup to lifespan
- Reason extraction missed symptom-style reasons: a caller said "I'm actually
blind" and the lead was logged with reason=None (the extractor had previously
caught "disintegrated eyes", but not this). Broadened the extractor's reason
rule to capture the eye problem/symptom as the reason, not just visit types.
Verified 3/3 -> "vision loss / blindness".
- server.py: move the LLM warmup/pin (keep_alive=-1) from the deprecated
on_event("startup") to a lifespan handler. This silences the FastAPI
deprecation warning; the model still shows UNTIL=Forever in `ollama ps`.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -25,7 +25,10 @@ _EXTRACT_INSTRUCTIONS = (
|
||||
"best one, false if they said to use a different number, null if it never came up\n"
|
||||
' "alternate_number": string or null — a different callback number the caller gave, digits only\n'
|
||||
' "location": string or null (which office/city)\n'
|
||||
' "reason": string or null (e.g. eye exam, broken glasses)\n'
|
||||
' "reason": string or null — WHY they want to be seen: the visit type OR the eye '
|
||||
"problem/symptom they describe. Capture symptoms too (e.g. \"annual eye exam\", \"blurry "
|
||||
'vision", "vision loss / blindness", "eye pain", "broken glasses", "red eye"). If they '
|
||||
"describe any eye or vision problem, that IS the reason.\n"
|
||||
' "insurance": string or null — the insurance plan the caller named, exactly as they said it\n'
|
||||
' "preferred_time": string or null — the day/time in the caller\'s own words\n'
|
||||
' "resolved_date": string or null — the actual calendar date the caller means as YYYY-MM-DD, '
|
||||
|
||||
20
server.py
20
server.py
@@ -26,7 +26,9 @@ import hmac
|
||||
import json
|
||||
import os
|
||||
import secrets
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
import httpx
|
||||
from fastapi import FastAPI, Request, WebSocket
|
||||
from fastapi.responses import HTMLResponse
|
||||
from loguru import logger
|
||||
@@ -62,16 +64,11 @@ BUSY_MESSAGE = os.environ.get(
|
||||
"Please call back in a few minutes. Goodbye.",
|
||||
)
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def _warm_llm():
|
||||
"""Pin the LLM in VRAM (keep_alive=-1) so the first turn of a call isn't a cold model
|
||||
reload. Cold reloads were adding ~3s of dead air to the first reply; latency is otherwise
|
||||
LLM-side (Whisper STT is ~0.1s). Best-effort — a failure here never blocks startup."""
|
||||
import httpx
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""On startup, pin the LLM in VRAM (keep_alive=-1) so the first turn of a call isn't a cold
|
||||
model reload. Cold reloads were adding ~3s of dead air to the first reply; latency is
|
||||
otherwise LLM-side (Whisper STT is ~0.1s). Best-effort — a failure never blocks startup."""
|
||||
base = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/v1").rstrip("/")
|
||||
if base.endswith("/v1"):
|
||||
base = base[:-3]
|
||||
@@ -83,8 +80,11 @@ async def _warm_llm():
|
||||
logger.info(f"Warmed + pinned Ollama model {model} (keep_alive=-1)")
|
||||
except Exception as e:
|
||||
logger.warning(f"LLM warmup failed (first call may be slow): {e!r}")
|
||||
yield
|
||||
|
||||
|
||||
app = FastAPI(lifespan=lifespan)
|
||||
|
||||
# Live count of active /ws pipelines (the real GPU consumers), guarded by a lock.
|
||||
_active_calls = 0
|
||||
_active_lock = asyncio.Lock()
|
||||
|
||||
Reference in New Issue
Block a user