feat: add LLM abstraction layer (router, Ollama backend, Claude backend)

2026-04-12 16:46:18 -04:00
parent 0e13b93e3a
commit 7d92c2ea6f
7 changed files with 394 additions and 0 deletions
--- a/agent_service/llm/ollama_backend.py
+++ b/agent_service/llm/ollama_backend.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+import asyncio, logging, time
+from .llm_types import LLMResponse, OllamaTimeoutError, OllamaUnavailableError
+
+logger = logging.getLogger(__name__)
+
+
+class OllamaBackend:
+    def __init__(self, url, model, timeout=120, max_concurrent=2):
+        self._url = url
+        self._model = model
+        self._timeout = timeout
+        self._semaphore = asyncio.Semaphore(max_concurrent)
+        self._active = 0
+
+    async def submit(self, messages, tools=None, caller='unknown'):
+        import ollama
+        wait_start = time.monotonic()
+        async with self._semaphore:
+            wait_ms = int((time.monotonic() - wait_start) * 1000)
+            self._active += 1
+            t0 = time.monotonic()
+            try:
+                kwargs = {'model': self._model, 'messages': messages}
+                if tools:
+                    kwargs['tools'] = tools
+                client = ollama.AsyncClient(host=self._url)
+                try:
+                    response = await asyncio.wait_for(client.chat(**kwargs), timeout=self._timeout)
+                except asyncio.TimeoutError:
+                    raise OllamaTimeoutError(f'Ollama timeout after {self._timeout}s caller={caller}')
+                except Exception as exc:
+                    s = str(exc).lower()
+                    if 'connect' in s or 'refused' in s or 'unreachable' in s:
+                        raise OllamaUnavailableError(f'Ollama unreachable: {exc}') from exc
+                    raise OllamaUnavailableError(f'Ollama error: {exc}') from exc
+                ms = int((time.monotonic() - t0) * 1000)
+                msg = response.message
+                tool_calls = None
+                if hasattr(msg, 'tool_calls') and msg.tool_calls:
+                    tool_calls = [{'name': tc.function.name, 'arguments': tc.function.arguments}
+                                  for tc in msg.tool_calls]
+                tin = response.prompt_eval_count or 0
+                tout = response.eval_count or 0
+                logger.info('ollama caller=%s wait_ms=%d inf_ms=%d tin=%d tout=%d',
+                            caller, wait_ms, ms, tin, tout)
+                return LLMResponse(content=msg.content or '', tool_calls=tool_calls,
+                                   backend_used='ollama', model_used=self._model,
+                                   tokens_in=tin, tokens_out=tout, latency_ms=ms)
+            finally:
+                self._active -= 1
+
+    @property
+    def active_count(self): return self._active