ping() was calling ollama.AsyncClient.list() which parses /api/tags with ollama==0.3.3 pydantic models. Vision models carry metadata fields that 0.3.x cannot deserialise, raising ValidationError -> OllamaUnavailableError. This made the /health/detailed ollama field 'error: ...' instead of 'ok', so ab_ai_bot.py REQUIRED_SYSTEMS check failed and the bot never went online even though the service was up. Fix: ping() now uses httpx GET /api/version — model-agnostic, no metadata parsing, always fast regardless of which model is loaded. Also fix LLMRouter to accept direct backend injection for testability (ollama=, claude=, privacy_mode=, env_overrides= kwargs), add _env_overrides lookup in hybrid get_backend(), and fix cloud mode to return ollama when _claude is None. All 6 test_llm_router tests now pass. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
110 lines
5.1 KiB
Python
110 lines
5.1 KiB
Python
from __future__ import annotations
|
|
import asyncio, logging, time
|
|
from .llm_types import LLMResponse, OllamaTimeoutError, OllamaUnavailableError
|
|
|
|
# Module-level logger; records are emitted under this module's dotted name.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class OllamaBackend:
    """Async wrapper around one Ollama server / model pair.

    Chat requests go through ``ollama.AsyncClient`` and are gated by an
    ``asyncio.Semaphore`` so that at most ``max_concurrent`` requests are
    in flight at once. ``active_count`` reports how many requests currently
    hold a semaphore slot.
    """

    _MIN_TIMEOUT = 300  # activeblue-chat needs ~124s to load from disk

    # Seconds allowed for the lightweight /api/version health probe.
    _PING_TIMEOUT = 5.0

    # Callers that parse the reply as JSON and therefore need structured
    # output forced. Without this llama3.1:8b returns plain English instead
    # of JSON. Kept at class level so the set is built once, not per request.
    _JSON_CALLERS = frozenset({'master', 'expenses_agent_receipt_parser'})

    def __init__(self, url: str, model: str, timeout: float = 300, max_concurrent: int = 2) -> None:
        """Store connection settings and create the concurrency gate.

        Args:
            url: Base URL of the Ollama server (passed as ``host=`` to the
                ollama client and used to build the ping URL).
            model: Model name sent with every chat request.
            timeout: Per-request timeout in seconds. Values below
                ``_MIN_TIMEOUT`` are raised to ``_MIN_TIMEOUT``.
            max_concurrent: Initial value of the semaphore limiting
                simultaneous ``submit`` calls.
        """
        self._url = url
        self._model = model
        self._timeout = max(timeout, self._MIN_TIMEOUT)
        self._semaphore = asyncio.Semaphore(max_concurrent)
        self._active = 0

    @staticmethod
    def _field(obj, key, default=None):
        """Read ``key`` from a dict or, failing that, as an attribute.

        ollama-python returns dicts in 0.3.x and pydantic objects in newer
        releases, so response fields must be readable from either shape.
        """
        if isinstance(obj, dict):
            return obj.get(key, default)
        return getattr(obj, key, default)

    @classmethod
    def _extract_tool_calls(cls, msg):
        """Return tool calls from a chat message as ``{'name', 'arguments'}`` dicts.

        Returns ``None`` when the message carries no (or an empty) tool-call
        list.
        """
        raw_tool_calls = cls._field(msg, 'tool_calls')
        if not raw_tool_calls:
            return None
        tool_calls = []
        for tc in raw_tool_calls:
            fn = cls._field(tc, 'function') or {}
            tool_calls.append({
                'name': cls._field(fn, 'name'),
                'arguments': cls._field(fn, 'arguments'),
            })
        return tool_calls

    def _version_url(self) -> str:
        """Return the ``/api/version`` URL, tolerating a trailing slash on the base URL."""
        return f'{self._url}'.rstrip('/') + '/api/version'

    async def submit(self, messages, tools=None, caller='unknown'):
        """Send one chat request to Ollama and return the parsed reply.

        Waits for a free semaphore slot first; the wait time and inference
        time are logged separately.

        Args:
            messages: Passed through as ``messages`` to ``AsyncClient.chat``.
            tools: Optional tool definitions, passed through as ``tools``
                when truthy.
            caller: Label used in logs and error messages. Callers listed in
                ``_JSON_CALLERS`` get ``format='json'`` when no tools are
                supplied.

        Returns:
            An ``LLMResponse`` carrying the message content ('' when absent),
            the extracted tool calls (or ``None``), token counts and latency
            in milliseconds.

        Raises:
            OllamaTimeoutError: The chat call did not finish within the
                configured timeout.
            OllamaUnavailableError: Any other exception raised by the chat
                call.
        """
        import ollama

        wait_start = time.monotonic()
        async with self._semaphore:
            wait_ms = int((time.monotonic() - wait_start) * 1000)
            self._active += 1
            t0 = time.monotonic()
            try:
                kwargs = {'model': self._model, 'messages': messages}
                if tools:
                    kwargs['tools'] = tools
                elif caller in self._JSON_CALLERS:
                    # Structured output is only forced when no tools are in play.
                    kwargs['format'] = 'json'

                client = ollama.AsyncClient(host=self._url)
                try:
                    response = await asyncio.wait_for(client.chat(**kwargs), timeout=self._timeout)
                except asyncio.TimeoutError as exc:
                    raise OllamaTimeoutError(
                        f'Ollama timeout after {self._timeout}s caller={caller}') from exc
                except Exception as exc:
                    s = str(exc).lower()
                    if 'connect' in s or 'refused' in s or 'unreachable' in s:
                        raise OllamaUnavailableError(f'Ollama unreachable: {exc}') from exc
                    raise OllamaUnavailableError(f'Ollama error: {exc}') from exc

                ms = int((time.monotonic() - t0) * 1000)

                msg = self._field(response, 'message') or {}
                tool_calls = self._extract_tool_calls(msg)
                tin = self._field(response, 'prompt_eval_count') or 0
                tout = self._field(response, 'eval_count') or 0
                logger.info('ollama caller=%s wait_ms=%d inf_ms=%d tin=%d tout=%d',
                            caller, wait_ms, ms, tin, tout)
                return LLMResponse(content=self._field(msg, 'content') or '', tool_calls=tool_calls,
                                   backend_used='ollama', model_used=self._model,
                                   tokens_in=tin, tokens_out=tout, latency_ms=ms)
            finally:
                # Always release the in-flight slot count, success or failure.
                self._active -= 1

    async def ping(self) -> None:
        """Raise if Ollama is unreachable.

        Uses /api/version rather than /api/tags so the check is model-agnostic
        and not affected by vision-model metadata that older ollama-python
        releases cannot deserialise.

        Raises:
            OllamaUnavailableError: The request timed out, failed, or
                returned an error HTTP status.
        """
        import httpx

        try:
            async with httpx.AsyncClient() as client:
                r = await client.get(self._version_url(), timeout=self._PING_TIMEOUT)
                r.raise_for_status()
        except httpx.TimeoutException as exc:
            raise OllamaUnavailableError('Ollama ping timed out') from exc
        except Exception as exc:
            raise OllamaUnavailableError(f'Ollama ping failed: {exc}') from exc

    async def warm_model(self) -> None:
        """Pre-load the configured model into VRAM via a minimal inference call.

        Best-effort: a timeout or any other failure is logged as a warning
        and never raised to the caller.
        """
        import ollama

        logger.info('ollama warm_model=%s starting (timeout=%ds)', self._model, self._timeout)
        t0 = time.monotonic()
        try:
            client = ollama.AsyncClient(host=self._url)
            await asyncio.wait_for(
                client.chat(model=self._model, messages=[{'role': 'user', 'content': 'hi'}]),
                timeout=self._timeout,
            )
            ms = int((time.monotonic() - t0) * 1000)
            logger.info('ollama warm_model=%s ready in %dms', self._model, ms)
        except asyncio.TimeoutError:
            logger.warning('ollama warm_model=%s timed out after %ds — model may still be loading',
                           self._model, self._timeout)
        except Exception as exc:
            logger.warning('ollama warm_model=%s failed: %s', self._model, exc)

    @property
    def active_count(self):
        """Number of ``submit`` calls currently holding a semaphore slot."""
        return self._active
|