Files
odoo-ai/agent_service/llm/ollama_backend.py
Carlos Garcia 564f1a9479 fix: raise Ollama timeout to 300s, add model pre-warming, improve health check
- OllamaBackend enforces _MIN_TIMEOUT=300s (overrides OLLAMA_TIMEOUT env var)
- warm_model() background task loads activeblue-chat into VRAM at startup
- health/detailed reports "warming" vs "ok" via Ollama ps() API
- README updated with May 2026 changes and test coverage details
2026-05-20 05:03:15 +00:00

104 lines
4.8 KiB
Python

from __future__ import annotations
import asyncio, logging, time
from .llm_types import LLMResponse, OllamaTimeoutError, OllamaUnavailableError
logger = logging.getLogger(__name__)
class OllamaBackend:
    """Async wrapper around one Ollama model with bounded concurrency.

    Each ``submit`` call acquires a semaphore slot, runs a single
    ``chat`` request against the configured host and converts the reply
    into an ``LLMResponse``. The number of requests currently holding a
    slot is exposed through ``active_count``.
    """

    _MIN_TIMEOUT = 300  # activeblue-chat needs ~124s to load from disk

    # Callers that parse the reply as JSON. Without format='json'
    # llama3.1:8b returns plain English instead of JSON.
    _JSON_CALLERS = frozenset({'master', 'expenses_agent_receipt_parser'})

    def __init__(self, url, model, timeout=300, max_concurrent=2):
        """Store connection settings and create the concurrency limiter.

        Args:
            url: Ollama host, passed as ``host=`` to ``ollama.AsyncClient``.
            model: Model name sent with every chat request.
            timeout: Per-request timeout in seconds. Values below
                ``_MIN_TIMEOUT`` are raised to ``_MIN_TIMEOUT``; ``None``
                is treated as ``_MIN_TIMEOUT``.
            max_concurrent: Initial value of the semaphore guarding
                ``submit``.
        """
        self._url = url
        self._model = model
        # max(None, int) raises TypeError, so map None onto the floor first.
        if timeout is None:
            timeout = self._MIN_TIMEOUT
        self._timeout = max(timeout, self._MIN_TIMEOUT)
        self._semaphore = asyncio.Semaphore(max_concurrent)
        self._active = 0

    @staticmethod
    def _field(obj, key, default=None):
        """Read ``key`` from a dict or, failing that, as an attribute.

        ollama-python returns dicts in 0.3.x, pydantic objects in newer
        releases — accept either shape.
        """
        if isinstance(obj, dict):
            return obj.get(key, default)
        return getattr(obj, key, default)

    @classmethod
    def _parse_tool_calls(cls, message):
        """Return ``[{'name', 'arguments'}, ...]`` for ``message``, or None.

        ``None`` is returned when the message carries no tool calls
        (missing, ``None`` or empty), so callers can tell "no tools
        requested" apart from a populated list.
        """
        raw_calls = cls._field(message, 'tool_calls')
        if not raw_calls:
            return None
        functions = (cls._field(call, 'function') or {} for call in raw_calls)
        return [
            {
                'name': cls._field(fn, 'name'),
                'arguments': cls._field(fn, 'arguments'),
            }
            for fn in functions
        ]

    async def submit(self, messages, tools=None, caller='unknown'):
        """Run one chat completion and return it as an ``LLMResponse``.

        Args:
            messages: Chat messages forwarded unchanged to ``client.chat``.
            tools: Optional tool definitions; forwarded only when truthy.
            caller: Label used in logs and error messages. Callers listed
                in ``_JSON_CALLERS`` get ``format='json'`` when no tools
                are supplied.

        Returns:
            ``LLMResponse`` with the reply content, parsed tool calls (or
            ``None``), token counts and inference latency in milliseconds.

        Raises:
            OllamaTimeoutError: The request exceeded the configured timeout.
            OllamaUnavailableError: Any other failure of the chat call.
        """
        import ollama

        wait_start = time.monotonic()
        async with self._semaphore:
            # Time spent queued behind other requests, not inference time.
            wait_ms = int((time.monotonic() - wait_start) * 1000)
            self._active += 1
            t0 = time.monotonic()
            try:
                kwargs = {'model': self._model, 'messages': messages}
                if tools:
                    kwargs['tools'] = tools
                elif caller in self._JSON_CALLERS:
                    # Force structured JSON output for callers that parse
                    # JSON responses (only when no tools are in play).
                    kwargs['format'] = 'json'

                client = ollama.AsyncClient(host=self._url)
                try:
                    response = await asyncio.wait_for(
                        client.chat(**kwargs), timeout=self._timeout
                    )
                except asyncio.TimeoutError as exc:
                    raise OllamaTimeoutError(
                        f'Ollama timeout after {self._timeout}s caller={caller}'
                    ) from exc
                except Exception as exc:
                    # Both branches raise the same type; only the message
                    # distinguishes connectivity problems from other errors.
                    text = str(exc).lower()
                    if 'connect' in text or 'refused' in text or 'unreachable' in text:
                        raise OllamaUnavailableError(f'Ollama unreachable: {exc}') from exc
                    raise OllamaUnavailableError(f'Ollama error: {exc}') from exc

                ms = int((time.monotonic() - t0) * 1000)
                msg = self._field(response, 'message') or {}
                tool_calls = self._parse_tool_calls(msg)
                # Missing or None counters are reported as 0.
                tin = self._field(response, 'prompt_eval_count') or 0
                tout = self._field(response, 'eval_count') or 0
                logger.info('ollama caller=%s wait_ms=%d inf_ms=%d tin=%d tout=%d',
                            caller, wait_ms, ms, tin, tout)
                return LLMResponse(content=self._field(msg, 'content') or '',
                                   tool_calls=tool_calls,
                                   backend_used='ollama', model_used=self._model,
                                   tokens_in=tin, tokens_out=tout, latency_ms=ms)
            finally:
                # Runs on success, error and cancellation alike.
                self._active -= 1

    async def ping(self, timeout=5) -> None:
        """Raise if Ollama is unreachable.

        Args:
            timeout: Seconds to wait for ``client.list()``; defaults to 5.

        Raises:
            OllamaUnavailableError: The list call timed out or failed.
        """
        import ollama

        client = ollama.AsyncClient(host=self._url)
        try:
            await asyncio.wait_for(client.list(), timeout=timeout)
        except asyncio.TimeoutError as exc:
            raise OllamaUnavailableError('Ollama ping timed out') from exc
        except Exception as exc:
            raise OllamaUnavailableError(f'Ollama ping failed: {exc}') from exc

    async def warm_model(self) -> None:
        """Pre-load the configured model into VRAM via a minimal inference call.

        Best-effort: timeouts and other failures are logged as warnings
        and never raised. The call does not go through the ``submit``
        semaphore and does not affect ``active_count``.
        """
        import ollama

        logger.info('ollama warm_model=%s starting (timeout=%ds)', self._model, self._timeout)
        t0 = time.monotonic()
        try:
            client = ollama.AsyncClient(host=self._url)
            await asyncio.wait_for(
                client.chat(model=self._model, messages=[{'role': 'user', 'content': 'hi'}]),
                timeout=self._timeout,
            )
            ms = int((time.monotonic() - t0) * 1000)
            logger.info('ollama warm_model=%s ready in %dms', self._model, ms)
        except asyncio.TimeoutError:
            logger.warning('ollama warm_model=%s timed out after %ds — model may still be loading',
                           self._model, self._timeout)
        except Exception as exc:
            # Deliberately swallowed: warming must not break the caller.
            logger.warning('ollama warm_model=%s failed: %s', self._model, exc)

    @property
    def active_count(self):
        """Number of ``submit`` calls currently holding a semaphore slot."""
        return self._active