feat: sysops_agent — Docker/git self-management with auto-heal
Adds a new specialist agent that gives the AI system control over its own infrastructure:
- sysops_tools.py: docker SDK (ps/logs/restart) + git CLI (pull/status/log) + Odoo channel notifier for autonomous action broadcasts
- sysops_agent.py: BaseAgent subclass handling on-demand chat requests, auto_heal() triggered by health failures, and sweep() for audits
- Background auto-heal loop (main.py): runs every 2 minutes, calls _get_failing_systems() and triggers auto_heal() when degraded
- health.py: extracted _get_failing_systems() helper reused by both the /health/detailed endpoint and the auto-heal loop
- docker-compose.yml: mount docker socket + /root/odoo workspace + SSH keys for git authentication
- Dockerfile: add git to apt-get
- requirements.txt: add docker==7.1.0 Python SDK

Auto-heal behavior:
- Detects failing containers, restarts them, notifies all bot DM channels
- Ollama (192.168.2.9) is flagged as external and skipped
- On-demand via chat: "restart agent", "check logs", "pull latest code"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -32,6 +32,41 @@ async def health():
|
||||
return HealthResponse(status='ok', uptime_seconds=round(time.time() - _start_time, 1))
|
||||
|
||||
|
||||
async def _get_failing_systems() -> list[str]:
    """Return a list of system names that are not reporting 'ok'.

    Probes, in order:

    * the database pool -- acquires a connection and runs ``SELECT 1``;
    * the master agent -- reported as failing when it is ``None``; when it
      exists and has an ``_odoo`` attribute, that client is pinged;
    * the LLM router -- when it exists and has an ``_ollama`` attribute,
      that client is pinged.

    Every awaited probe is limited to 5 seconds, and any exception raised
    by a probe (including a timeout) marks that system as failing instead
    of propagating to the caller.

    Returns:
        A list drawn from ``'db'``, ``'master_agent'``, ``'odoo'`` and
        ``'ollama'``, in that order.  Empty when nothing is failing.  A
        master agent without ``_odoo`` or a router without ``_ollama`` is
        not reported at all.
    """
    # Imported inside the function rather than at module level
    # (presumably to avoid a circular import -- confirm).
    from ..app_state import get_db_pool, get_master_agent, get_llm_router

    probe_timeout = 5  # seconds; shared by every probe below
    failing = []

    # --- database ---------------------------------------------------------
    pool = get_db_pool()
    if not pool:
        failing.append('db')
    else:
        try:
            async with pool.acquire(timeout=probe_timeout) as conn:
                # The acquire timeout only covers obtaining a connection;
                # bound the query as well so an unresponsive database
                # cannot stall the health check indefinitely.
                await asyncio.wait_for(
                    conn.fetchval('SELECT 1'), timeout=probe_timeout
                )
        except Exception:
            failing.append('db')

    # --- master agent and its Odoo client ---------------------------------
    master = get_master_agent()
    if master is None:
        failing.append('master_agent')
    elif hasattr(master, '_odoo'):
        try:
            await asyncio.wait_for(master._odoo.ping(), timeout=probe_timeout)
        except Exception:
            failing.append('odoo')

    # --- LLM router's Ollama client ---------------------------------------
    llm_router = get_llm_router()
    if llm_router and hasattr(llm_router, '_ollama'):
        try:
            await asyncio.wait_for(
                llm_router._ollama.ping(), timeout=probe_timeout
            )
        except Exception:
            failing.append('ollama')

    return failing
|
||||
|
||||
|
||||
@router.get('/detailed', response_model=DetailedHealthResponse)
|
||||
async def health_detailed():
|
||||
from ..app_state import get_db_pool, get_master_agent, get_llm_router
|
||||
|
||||
Reference in New Issue
Block a user