feat: add LLM abstraction layer (router, Ollama backend, Claude backend)
This commit is contained in:
54
agent_service/llm/ollama_backend.py
Normal file
54
agent_service/llm/ollama_backend.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from __future__ import annotations
|
||||
import asyncio, logging, time
|
||||
from .llm_types import LLMResponse, OllamaTimeoutError, OllamaUnavailableError
|
||||
|
||||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OllamaBackend:
    """LLM backend that sends chat requests to an Ollama server.

    The number of chat calls in flight at once is capped by an
    ``asyncio.Semaphore``; additional callers wait inside :meth:`submit`
    until a slot is released.
    """

    def __init__(self, url: str, model: str, timeout: float = 120, max_concurrent: int = 2) -> None:
        """Store connection settings and create the concurrency limiter.

        Args:
            url: Passed as ``host`` to ``ollama.AsyncClient``.
            model: Model name sent with every chat request.
            timeout: Seconds allowed for a single chat call. It bounds only
                the call itself, not the time spent waiting for a slot.
            max_concurrent: Maximum number of chat calls in flight at once.

        Raises:
            ValueError: If ``max_concurrent`` is less than 1.
        """
        # A semaphore with zero permits never admits anyone, so every
        # submit() would block forever; reject that configuration up front.
        if max_concurrent < 1:
            raise ValueError(f'max_concurrent must be >= 1, got {max_concurrent!r}')
        self._url = url
        self._model = model
        self._timeout = timeout
        self._semaphore = asyncio.Semaphore(max_concurrent)
        # Number of requests currently holding a semaphore slot.
        self._active = 0

    async def submit(self, messages, tools=None, caller='unknown'):
        """Send one chat request to Ollama and return the normalized result.

        Args:
            messages: Chat messages, forwarded unchanged to ``client.chat``.
            tools: Optional tool definitions; forwarded only when truthy.
            caller: Label identifying the requester, used in the log line and
                in the timeout error message.

        Returns:
            An ``LLMResponse`` holding the message content (``''`` when the
            model returned none), any tool calls as
            ``{'name': ..., 'arguments': ...}`` dicts (``None`` when there
            are none), token counts, and the inference latency in ms.

        Raises:
            OllamaTimeoutError: The chat call exceeded the configured timeout.
            OllamaUnavailableError: The chat call raised any other exception.
        """
        # Imported here rather than at module level, so the ``ollama``
        # package is only needed once a request is actually made.
        import ollama

        wait_start = time.monotonic()
        async with self._semaphore:
            # Time spent queued behind other requests, reported separately
            # from the inference time measured below.
            wait_ms = int((time.monotonic() - wait_start) * 1000)
            self._active += 1
            t0 = time.monotonic()
            try:
                kwargs = {'model': self._model, 'messages': messages}
                if tools:
                    kwargs['tools'] = tools
                client = ollama.AsyncClient(host=self._url)
                try:
                    response = await asyncio.wait_for(client.chat(**kwargs), timeout=self._timeout)
                except asyncio.TimeoutError as exc:
                    raise OllamaTimeoutError(
                        f'Ollama timeout after {self._timeout}s caller={caller}'
                    ) from exc
                except Exception as exc:
                    # Every non-timeout failure surfaces as
                    # OllamaUnavailableError; only the message wording
                    # depends on whether it looks like a connection problem.
                    text = str(exc).lower()
                    looks_unreachable = any(
                        marker in text for marker in ('connect', 'refused', 'unreachable')
                    )
                    reason = 'unreachable' if looks_unreachable else 'error'
                    raise OllamaUnavailableError(f'Ollama {reason}: {exc}') from exc

                ms = int((time.monotonic() - t0) * 1000)
                msg = response.message

                tool_calls = None
                raw_calls = getattr(msg, 'tool_calls', None)
                if raw_calls:
                    tool_calls = [
                        {'name': tc.function.name, 'arguments': tc.function.arguments}
                        for tc in raw_calls
                    ]

                # Missing/None counters are reported as 0.
                tin = response.prompt_eval_count or 0
                tout = response.eval_count or 0
                logger.info('ollama caller=%s wait_ms=%d inf_ms=%d tin=%d tout=%d',
                            caller, wait_ms, ms, tin, tout)
                return LLMResponse(content=msg.content or '', tool_calls=tool_calls,
                                   backend_used='ollama', model_used=self._model,
                                   tokens_in=tin, tokens_out=tout, latency_ms=ms)
            finally:
                # Runs on success and on every failure path, so the counter
                # always returns to its previous value.
                self._active -= 1

    @property
    def active_count(self):
        """Number of requests currently holding a concurrency slot."""
        return self._active
|
||||
Reference in New Issue
Block a user