diff --git a/extract.py b/extract.py index 307bc4a..6059c96 100644 --- a/extract.py +++ b/extract.py @@ -25,7 +25,10 @@ _EXTRACT_INSTRUCTIONS = ( "best one, false if they said to use a different number, null if it never came up\n" ' "alternate_number": string or null — a different callback number the caller gave, digits only\n' ' "location": string or null (which office/city)\n' - ' "reason": string or null (e.g. eye exam, broken glasses)\n' + ' "reason": string or null — WHY they want to be seen: the visit type OR the eye ' + "problem/symptom they describe. Capture symptoms too (e.g. \"annual eye exam\", \"blurry " + 'vision", "vision loss / blindness", "eye pain", "broken glasses", "red eye"). If they ' + "describe any eye or vision problem, that IS the reason.\n" ' "insurance": string or null — the insurance plan the caller named, exactly as they said it\n' ' "preferred_time": string or null — the day/time in the caller\'s own words\n' ' "resolved_date": string or null — the actual calendar date the caller means as YYYY-MM-DD, ' diff --git a/server.py b/server.py index 7c4efef..76d9a1f 100644 --- a/server.py +++ b/server.py @@ -26,7 +26,9 @@ import hmac import json import os import secrets +from contextlib import asynccontextmanager +import httpx from fastapi import FastAPI, Request, WebSocket from fastapi.responses import HTMLResponse from loguru import logger @@ -62,16 +64,11 @@ BUSY_MESSAGE = os.environ.get( "Please call back in a few minutes. Goodbye.", ) -app = FastAPI() - - -@app.on_event("startup") -async def _warm_llm(): - """Pin the LLM in VRAM (keep_alive=-1) so the first turn of a call isn't a cold model - reload. Cold reloads were adding ~3s of dead air to the first reply; latency is otherwise - LLM-side (Whisper STT is ~0.1s). Best-effort — a failure here never blocks startup.""" - import httpx - +@asynccontextmanager +async def lifespan(app: FastAPI): + """On startup, pin the LLM in VRAM (keep_alive=-1) so the first turn of a call isn't a cold + model reload. Cold reloads were adding ~3s of dead air to the first reply; latency is + otherwise LLM-side (Whisper STT is ~0.1s). Best-effort — a failure never blocks startup.""" base = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/v1").rstrip("/") if base.endswith("/v1"): base = base[:-3] @@ -83,8 +80,11 @@ async def _warm_llm(): logger.info(f"Warmed + pinned Ollama model {model} (keep_alive=-1)") except Exception as e: logger.warning(f"LLM warmup failed (first call may be slow): {e!r}") + yield +app = FastAPI(lifespan=lifespan) + # Live count of active /ws pipelines (the real GPU consumers), guarded by a lock. _active_calls = 0 _active_lock = asyncio.Lock()