"""Async backend that submits chat requests to the Anthropic Claude API."""
from __future__ import annotations

import asyncio
import logging
import time
from typing import Any

from .llm_types import LLMResponse, ClaudeTimeoutError, ClaudeAuthError, ClaudeRateLimitError

logger = logging.getLogger(__name__)


def _to_claude_tools(tools):
    """Convert internal tool descriptions to the ``tools`` payload sent to Claude.

    Each input tool is a mapping with a required ``'name'``, an optional
    ``'description'`` (defaults to the name) and an optional ``'parameters'``
    mapping of ``param_name -> spec``.  A spec may carry an ``'optional'``
    flag; that flag is stripped from the emitted property schema, and every
    parameter whose flag is missing or falsy is listed under ``'required'``.

    Args:
        tools: Iterable of tool description mappings.

    Returns:
        A list of dicts with ``'name'``, ``'description'`` and
        ``'input_schema'`` keys, in input order.

    Raises:
        KeyError: If a tool has no ``'name'``.
    """
    converted = []
    for tool in tools:
        # ``or {}`` also covers an explicit ``'parameters': None``, which would
        # otherwise fail on ``.items()``.
        params = tool.get('parameters') or {}
        properties = {
            name: {key: val for key, val in spec.items() if key != 'optional'}
            for name, spec in params.items()
        }
        required = [
            name for name, spec in params.items()
            if not spec.get('optional', False)
        ]
        converted.append({
            'name': tool['name'],
            'description': tool.get('description', tool['name']),
            'input_schema': {
                'type': 'object',
                'properties': properties,
                'required': required,
            },
        })
    return converted


class ClaudeBackend:
    """Concurrency-limited async client wrapper around ``anthropic.AsyncAnthropic``."""

    def __init__(self, api_key, model, timeout: float = 60, max_concurrent: int = 5):
        """Create the SDK client and the semaphore that caps in-flight requests.

        Args:
            api_key: Passed through to ``anthropic.AsyncAnthropic``.
            model: Model identifier sent with every request.
            timeout: Per-attempt timeout, in seconds, applied via
                ``asyncio.wait_for``.
            max_concurrent: Maximum number of ``submit`` calls allowed inside
                the semaphore at once.
        """
        # Imported lazily so the module can be imported without the SDK present.
        import anthropic

        self._client = anthropic.AsyncAnthropic(api_key=api_key)
        self._model = model
        self._timeout = timeout
        self._semaphore = asyncio.Semaphore(max_concurrent)
        self._active = 0

    async def _create_message(self, kw: dict[str, Any], caller: str):
        """Run one ``messages.create`` attempt bounded by the configured timeout.

        Timeout and authentication failures are translated to the package's
        own exception types; ``anthropic.RateLimitError`` is deliberately left
        to propagate so the caller can decide whether to retry.

        Raises:
            ClaudeTimeoutError: The attempt exceeded ``self._timeout`` seconds.
            ClaudeAuthError: The SDK raised ``anthropic.AuthenticationError``.
        """
        import anthropic

        try:
            return await asyncio.wait_for(
                self._client.messages.create(**kw), timeout=self._timeout
            )
        except asyncio.TimeoutError as exc:
            raise ClaudeTimeoutError(
                f'Claude timeout after {self._timeout}s caller={caller}'
            ) from exc
        except anthropic.AuthenticationError as exc:
            raise ClaudeAuthError(f'Claude auth error: {exc}') from exc

    async def submit(self, messages, tools=None, caller: str = 'unknown') -> LLMResponse:
        """Send a conversation to Claude and return the parsed response.

        Messages whose ``'role'`` is ``'system'`` are pulled out of the
        conversation and sent as the ``system`` argument; if several are
        present the last one wins.  On a rate-limit error the call sleeps
        5 seconds (while still holding its semaphore slot) and retries once.

        Args:
            messages: Iterable of message mappings with ``'role'`` and
                ``'content'`` keys.
            tools: Optional tool descriptions, converted with
                ``_to_claude_tools``; omitted from the request when falsy.
            caller: Label used only in log lines and timeout messages.

        Returns:
            An ``LLMResponse`` with the concatenated text blocks, the tool
            calls (``None`` when there are none), token counts and latency.

        Raises:
            ClaudeTimeoutError: An attempt (first or retry) timed out.
            ClaudeAuthError: An attempt (first or retry) failed authentication.
            ClaudeRateLimitError: The retry was rate-limited as well.
        """
        import anthropic

        wait_start = time.monotonic()
        async with self._semaphore:
            # Time spent queued behind other requests, reported separately
            # from the request latency below.
            wait_ms = int((time.monotonic() - wait_start) * 1000)
            self._active += 1
            t0 = time.monotonic()
            try:
                system = None
                conv = []
                for m in messages:
                    if m.get('role') == 'system':
                        system = m['content']
                    else:
                        conv.append(m)

                kw: dict[str, Any] = {
                    'model': self._model,
                    'max_tokens': 4096,
                    'messages': conv,
                }
                if system:
                    kw['system'] = system
                if tools:
                    kw['tools'] = _to_claude_tools(tools)

                try:
                    resp = await self._create_message(kw, caller)
                except anthropic.RateLimitError:
                    logger.warning('Claude rate limit, backing off 5s caller=%s', caller)
                    await asyncio.sleep(5)
                    try:
                        # Same helper as the first attempt, so timeout/auth
                        # failures on the retry are mapped identically.
                        resp = await self._create_message(kw, caller)
                    except anthropic.RateLimitError as exc2:
                        raise ClaudeRateLimitError(
                            f'Claude rate limit persists: {exc2}'
                        ) from exc2

                # Includes the backoff sleep when a retry happened.
                ms = int((time.monotonic() - t0) * 1000)

                text_parts: list[str] = []
                tool_calls: list[dict[str, Any]] = []
                for block in resp.content:
                    if block.type == 'text':
                        text_parts.append(block.text)
                    elif block.type == 'tool_use':
                        tool_calls.append({'name': block.name, 'arguments': block.input})
                text = ''.join(text_parts)

                tin, tout = resp.usage.input_tokens, resp.usage.output_tokens
                # Hard-coded rates of 3 / 15 per million input / output tokens;
                # NOTE(review): presumably USD pricing for one specific model —
                # confirm it matches self._model.  Used for logging only.
                cost = (tin * 3 + tout * 15) / 1_000_000
                logger.info(
                    'claude caller=%s model=%s wait_ms=%d ms=%d tin=%d tout=%d cost=%.5f',
                    caller, self._model, wait_ms, ms, tin, tout, cost,
                )
                return LLMResponse(
                    content=text,
                    # Callers receive None, not an empty list, when no tool was used.
                    tool_calls=tool_calls or None,
                    backend_used='claude',
                    model_used=self._model,
                    tokens_in=tin,
                    tokens_out=tout,
                    latency_ms=ms,
                )
            finally:
                self._active -= 1

    @property
    def active_count(self) -> int:
        """Number of ``submit`` calls currently holding a semaphore slot."""
        return self._active