fix: raise Ollama timeout to 300s, add model pre-warming, improve health check

- OllamaBackend enforces _MIN_TIMEOUT=300s (overrides OLLAMA_TIMEOUT env var)
- warm_model() background task loads activeblue-chat into VRAM at startup
- health/detailed reports "warming" vs "ok" via Ollama ps() API
- README updated with May 2026 changes and test coverage details
This commit is contained in:
2026-05-20 05:03:15 +00:00
parent 20a69313d7
commit 564f1a9479
5 changed files with 72 additions and 6 deletions

View File

@@ -74,6 +74,8 @@ async def lifespan(app: FastAPI):
llm_router = LLMRouter(config=settings, pg_pool=pool)
app_state.set_llm_router(llm_router)
logger.info('LLM router ready (mode=%s)', settings.llm_privacy_mode)
if hasattr(llm_router, '_ollama') and llm_router._ollama:
asyncio.create_task(_prewarm_ollama(llm_router._ollama))
except Exception as exc:
logger.error('Failed to init LLM router: %s', exc)
llm_router = None
@@ -199,6 +201,12 @@ def _register_specialist_agents(agent_registry, peer_bus, odoo, llm_router) -> N
logger.warning('Could not register %s: %s', agent_name, exc)
async def _prewarm_ollama(ollama_backend, delay: float = 5.0) -> None:
    """Load the configured model into VRAM at startup to avoid cold-start timeouts.

    Intended to run as a background task, so failures are logged rather than
    raised: nothing awaits the task, and an escaping exception would only
    surface as asyncio's "Task exception was never retrieved" message.

    Args:
        ollama_backend: Object exposing an awaitable ``warm_model()`` method.
        delay: Seconds to wait before warming; defaults to the original 5 s.
    """
    await asyncio.sleep(delay)
    try:
        await ollama_backend.warm_model()
    except Exception as exc:
        # Pre-warming is best-effort. asyncio.CancelledError is not an
        # Exception subclass, so task cancellation still propagates.
        logger.warning('Ollama pre-warm failed: %s', exc)
async def _auto_heal_loop(interval: int = 120) -> None:
"""Check health every interval seconds; call sysops_agent.auto_heal() if degraded."""
await asyncio.sleep(90) # let startup settle before first check