fix: raise Ollama timeout to 300s, add model pre-warming, improve health check
- OllamaBackend enforces _MIN_TIMEOUT=300s (OLLAMA_TIMEOUT values below 300 are raised to 300; larger values are kept)
- warm_model() background task loads activeblue-chat into VRAM at startup
- health/detailed reports "warming" vs "ok" via Ollama ps() API
- README updated with May 2026 changes and test coverage details
This commit is contained in:
@@ -14,7 +14,7 @@ class Settings(BaseSettings):
|
||||
# Ollama
|
||||
ollama_url: str = 'http://localhost:11434'
|
||||
ollama_model: str = 'activeblue-chat'
|
||||
ollama_timeout: int = 120
|
||||
ollama_timeout: int = 300
|
||||
ollama_max_concurrent: int = 2
|
||||
# Set to a vision-capable model (e.g. llama3.2-vision:11b) to use
|
||||
# vision OCR for receipt images instead of Tesseract. Leave empty
|
||||
|
||||
@@ -6,10 +6,12 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OllamaBackend:
|
||||
def __init__(self, url, model, timeout=120, max_concurrent=2):
|
||||
_MIN_TIMEOUT = 300 # activeblue-chat needs ~124s to load from disk
|
||||
|
||||
def __init__(self, url, model, timeout=300, max_concurrent=2):
|
||||
self._url = url
|
||||
self._model = model
|
||||
self._timeout = timeout
|
||||
self._timeout = max(timeout, self._MIN_TIMEOUT)
|
||||
self._semaphore = asyncio.Semaphore(max_concurrent)
|
||||
self._active = 0
|
||||
|
||||
@@ -78,5 +80,24 @@ class OllamaBackend:
|
||||
except Exception as exc:
|
||||
raise OllamaUnavailableError(f'Ollama ping failed: {exc}') from exc
|
||||
|
||||
async def warm_model(self) -> None:
    """Pre-load the configured model into VRAM via a minimal inference call.

    Sends a one-message chat request to the Ollama server at ``self._url``,
    capped at a single generated token, and waits at most ``self._timeout``
    seconds for it to finish.

    This is best effort: a timeout or any other error is logged as a
    warning and is never raised to the caller.
    """
    import ollama

    logger.info('ollama warm_model=%s starting (timeout=%ds)', self._model, self._timeout)
    t0 = time.monotonic()
    try:
        client = ollama.AsyncClient(host=self._url)
        await asyncio.wait_for(
            client.chat(
                model=self._model,
                messages=[{'role': 'user', 'content': 'hi'}],
                # The goal is loading the model, not the reply: stop
                # generation after one token so the warm-up stays minimal.
                options={'num_predict': 1},
            ),
            timeout=self._timeout,
        )
        ms = int((time.monotonic() - t0) * 1000)
        logger.info('ollama warm_model=%s ready in %dms', self._model, ms)
    except asyncio.TimeoutError:
        logger.warning(
            'ollama warm_model=%s timed out after %ds — model may still be loading',
            self._model, self._timeout,
        )
    except Exception as exc:
        # Warm-up must never break startup; report the failure and move on.
        logger.warning('ollama warm_model=%s failed: %s', self._model, exc)
|
||||
|
||||
@property
|
||||
def active_count(self): return self._active
|
||||
|
||||
@@ -74,6 +74,8 @@ async def lifespan(app: FastAPI):
|
||||
llm_router = LLMRouter(config=settings, pg_pool=pool)
|
||||
app_state.set_llm_router(llm_router)
|
||||
logger.info('LLM router ready (mode=%s)', settings.llm_privacy_mode)
|
||||
if hasattr(llm_router, '_ollama') and llm_router._ollama:
|
||||
asyncio.create_task(_prewarm_ollama(llm_router._ollama))
|
||||
except Exception as exc:
|
||||
logger.error('Failed to init LLM router: %s', exc)
|
||||
llm_router = None
|
||||
@@ -199,6 +201,12 @@ def _register_specialist_agents(agent_registry, peer_bus, odoo, llm_router) -> N
|
||||
logger.warning('Could not register %s: %s', agent_name, exc)
|
||||
|
||||
|
||||
async def _prewarm_ollama(ollama_backend, delay: float = 5) -> None:
    """Load the configured model into VRAM at startup to avoid cold-start timeouts.

    Args:
        ollama_backend: Object exposing an awaitable ``warm_model()`` method.
        delay: Seconds to wait before warming. Defaults to 5, the value that
            was previously hard-coded, so existing callers are unaffected.
    """
    await asyncio.sleep(delay)
    await ollama_backend.warm_model()
|
||||
|
||||
|
||||
async def _auto_heal_loop(interval: int = 120) -> None:
|
||||
"""Check health every interval seconds; call sysops_agent.auto_heal() if degraded."""
|
||||
await asyncio.sleep(90) # let startup settle before first check
|
||||
|
||||
@@ -63,6 +63,8 @@ async def _get_failing_systems() -> list[str]:
|
||||
await asyncio.wait_for(llm_router._ollama.ping(), timeout=5)
|
||||
except Exception:
|
||||
failing.append('ollama')
|
||||
elif not llm_router:
|
||||
failing.append('ollama')
|
||||
|
||||
return failing
|
||||
|
||||
@@ -96,13 +98,25 @@ async def health_detailed():
|
||||
except Exception as exc:
|
||||
odoo_status = f'error: {exc}'
|
||||
|
||||
# Ollama check
|
||||
# Ollama check — verify reachability and that the configured model is loaded
|
||||
ollama_status = 'unavailable'
|
||||
llm_router = get_llm_router()
|
||||
if llm_router and hasattr(llm_router, '_ollama'):
|
||||
try:
|
||||
await asyncio.wait_for(llm_router._ollama.ping(), timeout=5)
|
||||
ollama_status = 'ok'
|
||||
# Check whether the model is already warm in VRAM
|
||||
import ollama as _ollama_pkg
|
||||
client = _ollama_pkg.AsyncClient(host=llm_router._ollama._url)
|
||||
try:
|
||||
ps_resp = await asyncio.wait_for(client.ps(), timeout=5)
|
||||
loaded = getattr(ps_resp, 'models', ps_resp) if not isinstance(ps_resp, dict) else ps_resp.get('models', [])
|
||||
model_names = [getattr(m, 'model', None) or (m.get('model') if isinstance(m, dict) else None) for m in loaded]
|
||||
if any(llm_router._ollama._model in (n or '') for n in model_names):
|
||||
ollama_status = 'ok'
|
||||
else:
|
||||
ollama_status = 'warming'
|
||||
except Exception:
|
||||
ollama_status = 'ok' # ps() unsupported — treat as ok if ping succeeded
|
||||
except Exception as exc:
|
||||
ollama_status = f'error: {exc}'
|
||||
|
||||
|
||||
Reference in New Issue
Block a user