fix: raise Ollama timeout to 300s, add model pre-warming, improve health check
- OllamaBackend enforces _MIN_TIMEOUT=300s (overrides OLLAMA_TIMEOUT env var)
- warm_model() background task loads activeblue-chat into VRAM at startup
- health/detailed reports "warming" vs "ok" via Ollama ps() API
- README updated with May 2026 changes and test coverage details
This commit is contained in:
@@ -63,6 +63,8 @@ async def _get_failing_systems() -> list[str]:
|
||||
await asyncio.wait_for(llm_router._ollama.ping(), timeout=5)
|
||||
except Exception:
|
||||
failing.append('ollama')
|
||||
elif not llm_router:
|
||||
failing.append('ollama')
|
||||
|
||||
return failing
|
||||
|
||||
@@ -96,13 +98,25 @@ async def health_detailed():
|
||||
except Exception as exc:
|
||||
odoo_status = f'error: {exc}'
|
||||
|
||||
# Ollama check
|
||||
# Ollama check — verify reachability and that the configured model is loaded
|
||||
ollama_status = 'unavailable'
|
||||
llm_router = get_llm_router()
|
||||
if llm_router and hasattr(llm_router, '_ollama'):
|
||||
try:
|
||||
await asyncio.wait_for(llm_router._ollama.ping(), timeout=5)
|
||||
ollama_status = 'ok'
|
||||
# Check whether the model is already warm in VRAM
|
||||
import ollama as _ollama_pkg
|
||||
client = _ollama_pkg.AsyncClient(host=llm_router._ollama._url)
|
||||
try:
|
||||
ps_resp = await asyncio.wait_for(client.ps(), timeout=5)
|
||||
loaded = getattr(ps_resp, 'models', ps_resp) if not isinstance(ps_resp, dict) else ps_resp.get('models', [])
|
||||
model_names = [getattr(m, 'model', None) or (m.get('model') if isinstance(m, dict) else None) for m in loaded]
|
||||
if any(llm_router._ollama._model in (n or '') for n in model_names):
|
||||
ollama_status = 'ok'
|
||||
else:
|
||||
ollama_status = 'warming'
|
||||
except Exception:
|
||||
ollama_status = 'ok' # ps() unsupported — treat as ok if ping succeeded
|
||||
except Exception as exc:
|
||||
ollama_status = f'error: {exc}'
|
||||
|
||||
|
||||
Reference in New Issue
Block a user