Files
odoo-ai/agent_service/routers/health.py
Carlos Garcia 8d1727b498 feat: sysops_agent — Docker/git self-management with auto-heal
Adds a new specialist agent that gives the AI system control over its
own infrastructure:

- sysops_tools.py: docker SDK (ps/logs/restart) + git CLI (pull/status/log)
  + Odoo channel notifier for autonomous action broadcasts
- sysops_agent.py: BaseAgent subclass handling on-demand chat requests,
  auto_heal() triggered by health failures, and sweep() for audits
- Background auto-heal loop (main.py): runs every 2 minutes, calls
  _get_failing_systems() and triggers auto_heal() when degraded
- health.py: extracted _get_failing_systems() helper reused by both
  the /health/detailed endpoint and the auto-heal loop
- docker-compose.yml: mount docker socket + /root/odoo workspace +
  SSH keys for git authentication
- Dockerfile: add git to apt-get
- requirements.txt: add docker==7.1.0 Python SDK

Auto-heal behavior:
  - Detects failing containers, restarts them, notifies all bot DM channels
  - Ollama (192.168.2.9) is flagged as external and skipped
  - On-demand via chat: "restart agent", "check logs", "pull latest code"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 17:01:57 -04:00

121 lines
3.3 KiB
Python

from __future__ import annotations
import asyncio
import logging
import time
from fastapi import APIRouter
from pydantic import BaseModel
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router that groups the endpoints of this module under the /health prefix.
router = APIRouter(prefix='/health', tags=['health'])
# Wall-clock timestamp captured at import time; uptime is measured against it.
_start_time = time.time()
# Response body of the basic GET /health endpoint.
class HealthResponse(BaseModel):
    # Health indicator; the basic endpoint in this module always sends 'ok'.
    status: str
    # Seconds elapsed since this module was imported, rounded to one decimal.
    uptime_seconds: float
# Response body of GET /health/detailed: one status string per subsystem.
class DetailedHealthResponse(BaseModel):
    # Overall verdict: 'ok' or 'degraded'.
    status: str
    # Seconds elapsed since this module was imported, rounded to one decimal.
    uptime_seconds: float
    # Database probe result: 'ok', 'unavailable' or 'error: <detail>'.
    db: str
    # Odoo ping result: 'ok', 'unavailable' or 'error: <detail>'.
    odoo: str
    # Ollama ping result: 'ok', 'unavailable' or 'error: <detail>'.
    ollama: str
    # 'ok' when a master agent is registered, otherwise 'unavailable'.
    master_agent: str
    # Value of the ``llm_privacy_mode`` setting.
    privacy_mode: str
@router.get('', response_model=HealthResponse)
async def health():
    # Basic health endpoint: performs no dependency checks, so it always
    # answers 'ok' together with the time elapsed since module import.
    elapsed = time.time() - _start_time
    return HealthResponse(uptime_seconds=round(elapsed, 1), status='ok')
async def _get_failing_systems() -> list[str]:
    """Return the names of the systems that are not reporting 'ok'.

    The probes run one after another:

    * ``'db'`` -- no pool is available, or acquiring a connection (limited
      to 5 seconds) and running ``SELECT 1`` raised.
    * ``'master_agent'`` -- no master agent is registered.
    * ``'odoo'`` -- the master agent has an ``_odoo`` attribute and its
      ``ping()`` raised or did not finish within 5 seconds.
    * ``'ollama'`` -- the LLM router has an ``_ollama`` attribute and its
      ``ping()`` raised or did not finish within 5 seconds.

    A master agent without ``_odoo``, a missing LLM router, or a router
    without ``_ollama`` is *not* reported as a failure.

    Exceptions raised by a probe are logged and recorded as a failure of
    that system instead of being propagated.

    Returns:
        Failing system names in probe order; empty when nothing failed.
    """
    from ..app_state import get_db_pool, get_master_agent, get_llm_router

    failing: list[str] = []

    # Database: a missing pool counts as a failure just like a failed query.
    pool = get_db_pool()
    if not pool:
        failing.append('db')
    else:
        try:
            async with pool.acquire(timeout=5) as conn:
                await conn.fetchval('SELECT 1')
        except Exception as exc:
            # Keep the cause visible; the caller only receives the name.
            logger.warning('Health probe failed for db: %r', exc)
            failing.append('db')

    # Master agent, and Odoo through the agent's client when it has one.
    master = get_master_agent()
    if master is None:
        failing.append('master_agent')
    elif hasattr(master, '_odoo'):
        try:
            await asyncio.wait_for(master._odoo.ping(), timeout=5)
        except Exception as exc:
            logger.warning('Health probe failed for odoo: %r', exc)
            failing.append('odoo')

    # Ollama is only probed when the router actually carries a client.
    llm_router = get_llm_router()
    if llm_router and hasattr(llm_router, '_ollama'):
        try:
            await asyncio.wait_for(llm_router._ollama.ping(), timeout=5)
        except Exception as exc:
            logger.warning('Health probe failed for ollama: %r', exc)
            failing.append('ollama')

    return failing
def _error_status(exc: Exception) -> str:
    """Format a probe failure as an ``error: <detail>`` status string.

    Some exceptions stringify to an empty string (for example the timeout
    raised by ``asyncio.wait_for``), which would otherwise produce a bare
    ``'error: '``; in that case the exception class name is used instead.
    """
    return f'error: {str(exc) or type(exc).__name__}'


@router.get('/detailed', response_model=DetailedHealthResponse)
async def health_detailed():
    """Report the status of the database, Odoo, Ollama and the master agent.

    Each probe yields 'ok', 'unavailable' (nothing to probe) or
    'error: <detail>'. The overall status is 'ok' only when both the
    database and the master agent are 'ok'; otherwise it is 'degraded'.
    """
    from ..app_state import get_db_pool, get_master_agent, get_llm_router
    from ..config import get_settings

    uptime = round(time.time() - _start_time, 1)
    settings = get_settings()

    # DB check: acquire a connection (5 s limit) and run a trivial query.
    db_status = 'unavailable'
    pool = get_db_pool()
    if pool:
        try:
            async with pool.acquire(timeout=5) as conn:
                await conn.fetchval('SELECT 1')
        except Exception as exc:
            db_status = _error_status(exc)
        else:
            db_status = 'ok'

    # Odoo check: ping through the master agent's client, if it has one.
    odoo_status = 'unavailable'
    master = get_master_agent()
    if master and hasattr(master, '_odoo'):
        try:
            await asyncio.wait_for(master._odoo.ping(), timeout=5)
        except Exception as exc:
            odoo_status = _error_status(exc)
        else:
            odoo_status = 'ok'

    # Ollama check: ping through the LLM router's client, if it has one.
    ollama_status = 'unavailable'
    llm_router = get_llm_router()
    if llm_router and hasattr(llm_router, '_ollama'):
        try:
            await asyncio.wait_for(llm_router._ollama.ping(), timeout=5)
        except Exception as exc:
            ollama_status = _error_status(exc)
        else:
            ollama_status = 'ok'

    master_status = 'ok' if master is not None else 'unavailable'

    # Only the database and the master agent decide the overall verdict;
    # Odoo and Ollama are reported but do not degrade it.
    overall = 'ok' if all(s == 'ok' for s in [db_status, master_status]) else 'degraded'

    return DetailedHealthResponse(
        status=overall,
        uptime_seconds=uptime,
        db=db_status,
        odoo=odoo_status,
        ollama=ollama_status,
        master_agent=master_status,
        privacy_mode=settings.llm_privacy_mode,
    )