fix: raise Ollama timeout to 300s, add model pre-warming, improve health check

- OllamaBackend enforces _MIN_TIMEOUT=300s (overrides OLLAMA_TIMEOUT env var)
- warm_model() background task loads activeblue-chat into VRAM at startup
- health/detailed reports "warming" vs "ok" via Ollama ps() API
- README updated with May 2026 changes and test coverage details
This commit is contained in:
2026-05-20 05:03:15 +00:00
parent 20a69313d7
commit 564f1a9479
5 changed files with 72 additions and 6 deletions

View File

@@ -6,10 +6,12 @@ logger = logging.getLogger(__name__)
class OllamaBackend:
def __init__(self, url, model, timeout=120, max_concurrent=2):
_MIN_TIMEOUT = 300 # activeblue-chat needs ~124s to load from disk
def __init__(self, url, model, timeout=300, max_concurrent=2):
self._url = url
self._model = model
self._timeout = timeout
self._timeout = max(timeout, self._MIN_TIMEOUT)
self._semaphore = asyncio.Semaphore(max_concurrent)
self._active = 0
@@ -78,5 +80,24 @@ class OllamaBackend:
except Exception as exc:
raise OllamaUnavailableError(f'Ollama ping failed: {exc}') from exc
async def warm_model(self) -> None:
    """Warm up the configured model by sending it one tiny chat request.

    A single-message chat ('hi') is issued against ``self._model`` through
    an ``ollama.AsyncClient`` pointed at ``self._url``; the wait is capped
    at ``self._timeout`` seconds. The outcome is only logged:

    * success  -> INFO with the elapsed wall time in milliseconds
    * timeout  -> WARNING (the request is abandoned on our side)
    * any other ``Exception`` -> WARNING with the exception text

    Neither a timeout nor another ``Exception`` is re-raised. A failure to
    import ``ollama`` is not caught, because the import happens before the
    ``try`` block.
    """
    import ollama

    logger.info('ollama warm_model=%s starting (timeout=%ds)', self._model, self._timeout)
    started = time.monotonic()
    try:
        prompt = [{'role': 'user', 'content': 'hi'}]
        request = ollama.AsyncClient(host=self._url).chat(
            model=self._model,
            messages=prompt,
        )
        # Bound the wait so a stuck request cannot block this task forever.
        await asyncio.wait_for(request, timeout=self._timeout)
        elapsed_ms = int((time.monotonic() - started) * 1000)
        logger.info('ollama warm_model=%s ready in %dms', self._model, elapsed_ms)
    except asyncio.TimeoutError:
        logger.warning(
            'ollama warm_model=%s timed out after %ds — model may still be loading',
            self._model,
            self._timeout,
        )
    except Exception as exc:
        # Best-effort warm-up: report the problem and carry on.
        logger.warning('ollama warm_model=%s failed: %s', self._model, exc)
@property
def active_count(self):
    """Return the current value of the ``_active`` counter."""
    current = self._active
    return current