feat: add LLM abstraction layer (router, Ollama backend, Claude backend)

2026-04-12 16:46:18 -04:00
parent 0e13b93e3a
commit 7d92c2ea6f
7 changed files with 394 additions and 0 deletions
--- a/agent_service/llm/claude_backend.py
+++ b/agent_service/llm/claude_backend.py
@@ -0,0 +1,83 @@
+from __future__ import annotations
+import asyncio, logging, time
+from typing import Any
+from .llm_types import LLMResponse, ClaudeTimeoutError, ClaudeAuthError, ClaudeRateLimitError
+
+logger = logging.getLogger(__name__)
+
+
+def _to_claude_tools(tools):
+    result = []
+    for t in tools:
+        params = t.get('parameters', {})
+        props, req = {}, []
+        for k, v in params.items():
+            props[k] = {k2: v2 for k2, v2 in v.items() if k2 != 'optional'}
+            if not v.get('optional', False):
+                req.append(k)
+        result.append({'name': t['name'], 'description': t.get('description', t['name']),
+                       'input_schema': {'type': 'object', 'properties': props, 'required': req}})
+    return result
+
+
+class ClaudeBackend:
+    def __init__(self, api_key, model, timeout=60, max_concurrent=5):
+        import anthropic
+        self._client = anthropic.AsyncAnthropic(api_key=api_key)
+        self._model = model
+        self._timeout = timeout
+        self._semaphore = asyncio.Semaphore(max_concurrent)
+        self._active = 0
+
+    async def submit(self, messages, tools=None, caller='unknown'):
+        import anthropic
+        wait_start = time.monotonic()
+        async with self._semaphore:
+            wait_ms = int((time.monotonic() - wait_start) * 1000)
+            self._active += 1
+            t0 = time.monotonic()
+            try:
+                system = None
+                conv = []
+                for m in messages:
+                    if m.get('role') == 'system':
+                        system = m['content']
+                    else:
+                        conv.append(m)
+                kw: dict[str, Any] = {'model': self._model, 'max_tokens': 4096, 'messages': conv}
+                if system:
+                    kw['system'] = system
+                if tools:
+                    kw['tools'] = _to_claude_tools(tools)
+                try:
+                    resp = await asyncio.wait_for(self._client.messages.create(**kw), timeout=self._timeout)
+                except asyncio.TimeoutError:
+                    raise ClaudeTimeoutError(f'Claude timeout after {self._timeout}s caller={caller}')
+                except anthropic.AuthenticationError as exc:
+                    raise ClaudeAuthError(f'Claude auth error: {exc}') from exc
+                except anthropic.RateLimitError:
+                    logger.warning('Claude rate limit, backing off 5s caller=%s', caller)
+                    await asyncio.sleep(5)
+                    try:
+                        resp = await asyncio.wait_for(self._client.messages.create(**kw), timeout=self._timeout)
+                    except anthropic.RateLimitError as exc2:
+                        raise ClaudeRateLimitError(f'Claude rate limit persists: {exc2}') from exc2
+                ms = int((time.monotonic() - t0) * 1000)
+                text, tool_calls = '', None
+                for block in resp.content:
+                    if block.type == 'text':
+                        text += block.text
+                    elif block.type == 'tool_use':
+                        if tool_calls is None: tool_calls = []
+                        tool_calls.append({'name': block.name, 'arguments': block.input})
+                tin, tout = resp.usage.input_tokens, resp.usage.output_tokens
+                cost = (tin * 3 + tout * 15) / 1_000_000
+                logger.info('claude caller=%s model=%s wait_ms=%d ms=%d tin=%d tout=%d cost=%.5f',
+                            caller, self._model, wait_ms, ms, tin, tout, cost)
+                return LLMResponse(content=text, tool_calls=tool_calls, backend_used='claude',
+                                   model_used=self._model, tokens_in=tin, tokens_out=tout, latency_ms=ms)
+            finally:
+                self._active -= 1
+
+    @property
+    def active_count(self): return self._active