diff --git a/extract.py b/extract.py
index 307bc4a..6059c96 100644
--- a/extract.py
+++ b/extract.py
@@ -25,7 +25,10 @@ _EXTRACT_INSTRUCTIONS = (
     "best one, false if they said to use a different number, null if it never came up\n"
     '  "alternate_number": string or null — a different callback number the caller gave, digits only\n'
     '  "location": string or null (which office/city)\n'
-    '  "reason": string or null (e.g. eye exam, broken glasses)\n'
+    '  "reason": string or null — WHY they want to be seen: the visit type OR the eye '
+    "problem/symptom they describe. Capture symptoms too (e.g. \"annual eye exam\", \"blurry "
+    'vision", "vision loss / blindness", "eye pain", "broken glasses", "red eye"). If they '
+    "describe any eye or vision problem, that IS the reason.\n"
     '  "insurance": string or null — the insurance plan the caller named, exactly as they said it\n'
     '  "preferred_time": string or null — the day/time in the caller\'s own words\n'
     '  "resolved_date": string or null — the actual calendar date the caller means as YYYY-MM-DD, '
diff --git a/server.py b/server.py
index 7c4efef..76d9a1f 100644
--- a/server.py
+++ b/server.py
@@ -26,7 +26,9 @@ import hmac
 import json
 import os
 import secrets
+from contextlib import asynccontextmanager
 
+import httpx
 from fastapi import FastAPI, Request, WebSocket
 from fastapi.responses import HTMLResponse
 from loguru import logger
@@ -62,16 +64,11 @@ BUSY_MESSAGE = os.environ.get(
     "Please call back in a few minutes. Goodbye.",
 )
 
-app = FastAPI()
-
-
-@app.on_event("startup")
-async def _warm_llm():
-    """Pin the LLM in VRAM (keep_alive=-1) so the first turn of a call isn't a cold model
-    reload. Cold reloads were adding ~3s of dead air to the first reply; latency is otherwise
-    LLM-side (Whisper STT is ~0.1s). Best-effort — a failure here never blocks startup."""
-    import httpx
-
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """On startup, pin the LLM in VRAM (keep_alive=-1) so the first turn of a call isn't a cold
+    model reload. Cold reloads were adding ~3s of dead air to the first reply; latency is
+    otherwise LLM-side (Whisper STT is ~0.1s). Best-effort — a failure never blocks startup."""
     base = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/v1").rstrip("/")
     if base.endswith("/v1"):
         base = base[:-3]
@@ -83,8 +80,11 @@ async def _warm_llm():
         logger.info(f"Warmed + pinned Ollama model {model} (keep_alive=-1)")
     except Exception as e:
         logger.warning(f"LLM warmup failed (first call may be slow): {e!r}")
+    yield
 
 
+app = FastAPI(lifespan=lifespan)
+
 # Live count of active /ws pipelines (the real GPU consumers), guarded by a lock.
 _active_calls = 0
 _active_lock = asyncio.Lock()