from __future__ import annotations
import asyncio, logging, time
from .llm_types import LLMResponse, OllamaTimeoutError, OllamaUnavailableError

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class OllamaBackend:
    """Concurrency-limited async front end for one Ollama chat model.

    Every :meth:`submit` call sends a single chat request to the server at
    ``url``. An :class:`asyncio.Semaphore` caps the number of chat calls in
    flight at ``max_concurrent``; further callers wait for a free slot.
    """

    # Substrings of a lower-cased error text that mark a connectivity failure.
    _UNREACHABLE_MARKERS = ('connect', 'refused', 'unreachable')

    def __init__(self, url: str, model: str, timeout: float = 120,
                 max_concurrent: int = 2) -> None:
        """Store the connection settings and create the concurrency limiter.

        Args:
            url: Host handed to ``ollama.AsyncClient``.
            model: Model name sent with every chat request.
            timeout: Seconds allowed for one chat call (passed to
                ``asyncio.wait_for``).
            max_concurrent: Maximum number of simultaneous chat calls.

        Raises:
            ValueError: If ``max_concurrent`` is smaller than 1.
        """
        if max_concurrent < 1:
            # Semaphore(0) is legal but would make every submit() wait forever.
            raise ValueError(f'max_concurrent must be >= 1, got {max_concurrent}')
        self._url = url
        self._model = model
        self._timeout = timeout
        self._semaphore = asyncio.Semaphore(max_concurrent)
        self._active = 0

    async def submit(self, messages, tools=None, caller: str = 'unknown'):
        """Send one chat request and return the normalized reply.

        The call first waits for a concurrency slot. Time spent waiting is
        logged as ``wait_ms`` but is covered neither by the timeout nor by the
        reported latency, both of which apply to the chat call only.

        Args:
            messages: Chat messages, forwarded unchanged to the client.
            tools: Optional tool definitions; forwarded only when truthy.
            caller: Label included in the log line and in timeout messages.

        Returns:
            An ``LLMResponse`` carrying the reply text (``''`` when absent),
            the tool calls as ``{'name': ..., 'arguments': ...}`` dicts
            (``None`` when there are none), the prompt/output token counts
            (``0`` when not reported) and the chat-call latency in ms.

        Raises:
            OllamaTimeoutError: The chat call exceeded ``timeout`` seconds.
            OllamaUnavailableError: The chat call raised any other exception.
        """
        # Imported at call time, so the package is only needed once a request is made.
        import ollama

        queued_at = time.monotonic()
        async with self._semaphore:
            wait_ms = int((time.monotonic() - queued_at) * 1000)
            self._active += 1
            started_at = time.monotonic()
            try:
                request = {'model': self._model, 'messages': messages}
                if tools:
                    request['tools'] = tools
                client = ollama.AsyncClient(host=self._url)
                try:
                    response = await asyncio.wait_for(
                        client.chat(**request), timeout=self._timeout)
                except asyncio.TimeoutError as exc:
                    raise OllamaTimeoutError(
                        f'Ollama timeout after {self._timeout}s caller={caller}'
                    ) from exc
                except Exception as exc:
                    # An exception built without a message stringifies to '';
                    # fall back to its type name so the error stays informative.
                    detail = str(exc) or type(exc).__name__
                    lowered = detail.lower()
                    if any(marker in lowered for marker in self._UNREACHABLE_MARKERS):
                        raise OllamaUnavailableError(f'Ollama unreachable: {detail}') from exc
                    raise OllamaUnavailableError(f'Ollama error: {detail}') from exc

                latency_ms = int((time.monotonic() - started_at) * 1000)
                message = response.message
                tool_calls = self._extract_tool_calls(message)
                tokens_in = response.prompt_eval_count or 0
                tokens_out = response.eval_count or 0
                logger.info('ollama caller=%s wait_ms=%d inf_ms=%d tin=%d tout=%d',
                            caller, wait_ms, latency_ms, tokens_in, tokens_out)
                return LLMResponse(content=message.content or '', tool_calls=tool_calls,
                                   backend_used='ollama', model_used=self._model,
                                   tokens_in=tokens_in, tokens_out=tokens_out,
                                   latency_ms=latency_ms)
            finally:
                # Always release the in-flight count, on success and on failure.
                self._active -= 1

    @staticmethod
    def _extract_tool_calls(message):
        """Return the message's tool calls as plain dicts, or ``None`` if it has none."""
        calls = getattr(message, 'tool_calls', None)
        if not calls:
            return None
        return [{'name': call.function.name, 'arguments': call.function.arguments}
                for call in calls]

    @property
    def active_count(self) -> int:
        """Number of chat calls currently holding a concurrency slot."""
        return self._active