feat: add LLM abstraction layer (router, Ollama backend, Claude backend)
This commit is contained in:
83
agent_service/llm/claude_backend.py
Normal file
83
agent_service/llm/claude_backend.py
Normal file
@@ -0,0 +1,83 @@
|
||||
from __future__ import annotations
|
||||
import asyncio, logging, time
|
||||
from typing import Any
|
||||
from .llm_types import LLMResponse, ClaudeTimeoutError, ClaudeAuthError, ClaudeRateLimitError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _to_claude_tools(tools):
|
||||
result = []
|
||||
for t in tools:
|
||||
params = t.get('parameters', {})
|
||||
props, req = {}, []
|
||||
for k, v in params.items():
|
||||
props[k] = {k2: v2 for k2, v2 in v.items() if k2 != 'optional'}
|
||||
if not v.get('optional', False):
|
||||
req.append(k)
|
||||
result.append({'name': t['name'], 'description': t.get('description', t['name']),
|
||||
'input_schema': {'type': 'object', 'properties': props, 'required': req}})
|
||||
return result
|
||||
|
||||
|
||||
class ClaudeBackend:
|
||||
def __init__(self, api_key, model, timeout=60, max_concurrent=5):
|
||||
import anthropic
|
||||
self._client = anthropic.AsyncAnthropic(api_key=api_key)
|
||||
self._model = model
|
||||
self._timeout = timeout
|
||||
self._semaphore = asyncio.Semaphore(max_concurrent)
|
||||
self._active = 0
|
||||
|
||||
async def submit(self, messages, tools=None, caller='unknown'):
|
||||
import anthropic
|
||||
wait_start = time.monotonic()
|
||||
async with self._semaphore:
|
||||
wait_ms = int((time.monotonic() - wait_start) * 1000)
|
||||
self._active += 1
|
||||
t0 = time.monotonic()
|
||||
try:
|
||||
system = None
|
||||
conv = []
|
||||
for m in messages:
|
||||
if m.get('role') == 'system':
|
||||
system = m['content']
|
||||
else:
|
||||
conv.append(m)
|
||||
kw: dict[str, Any] = {'model': self._model, 'max_tokens': 4096, 'messages': conv}
|
||||
if system:
|
||||
kw['system'] = system
|
||||
if tools:
|
||||
kw['tools'] = _to_claude_tools(tools)
|
||||
try:
|
||||
resp = await asyncio.wait_for(self._client.messages.create(**kw), timeout=self._timeout)
|
||||
except asyncio.TimeoutError:
|
||||
raise ClaudeTimeoutError(f'Claude timeout after {self._timeout}s caller={caller}')
|
||||
except anthropic.AuthenticationError as exc:
|
||||
raise ClaudeAuthError(f'Claude auth error: {exc}') from exc
|
||||
except anthropic.RateLimitError:
|
||||
logger.warning('Claude rate limit, backing off 5s caller=%s', caller)
|
||||
await asyncio.sleep(5)
|
||||
try:
|
||||
resp = await asyncio.wait_for(self._client.messages.create(**kw), timeout=self._timeout)
|
||||
except anthropic.RateLimitError as exc2:
|
||||
raise ClaudeRateLimitError(f'Claude rate limit persists: {exc2}') from exc2
|
||||
ms = int((time.monotonic() - t0) * 1000)
|
||||
text, tool_calls = '', None
|
||||
for block in resp.content:
|
||||
if block.type == 'text':
|
||||
text += block.text
|
||||
elif block.type == 'tool_use':
|
||||
if tool_calls is None: tool_calls = []
|
||||
tool_calls.append({'name': block.name, 'arguments': block.input})
|
||||
tin, tout = resp.usage.input_tokens, resp.usage.output_tokens
|
||||
cost = (tin * 3 + tout * 15) / 1_000_000
|
||||
logger.info('claude caller=%s model=%s wait_ms=%d ms=%d tin=%d tout=%d cost=%.5f',
|
||||
caller, self._model, wait_ms, ms, tin, tout, cost)
|
||||
return LLMResponse(content=text, tool_calls=tool_calls, backend_used='claude',
|
||||
model_used=self._model, tokens_in=tin, tokens_out=tout, latency_ms=ms)
|
||||
finally:
|
||||
self._active -= 1
|
||||
|
||||
@property
|
||||
def active_count(self): return self._active
|
||||
Reference in New Issue
Block a user