feat: sysops_agent — Docker/git self-management with auto-heal
Adds a new specialist agent that gives the AI system control over its own infrastructure:

- sysops_tools.py: docker SDK (ps/logs/restart) + git CLI (pull/status/log) + Odoo channel notifier for autonomous action broadcasts
- sysops_agent.py: BaseAgent subclass handling on-demand chat requests, auto_heal() triggered by health failures, and sweep() for audits
- Background auto-heal loop (main.py): runs every 2 minutes, calls _get_failing_systems() and triggers auto_heal() when degraded
- health.py: extracted _get_failing_systems() helper reused by both the /health/detailed endpoint and the auto-heal loop
- docker-compose.yml: mount docker socket + /root/odoo workspace + SSH keys for git authentication
- Dockerfile: add git to apt-get
- requirements.txt: add docker==7.1.0 Python SDK

Auto-heal behavior:

- Detects failing containers, restarts them, notifies all bot DM channels
- Ollama (192.168.2.9) is flagged as external and skipped
- On-demand via chat: "restart agent", "check logs", "pull latest code"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -145,6 +145,9 @@ async def lifespan(app: FastAPI):
|
||||
except Exception as exc:
|
||||
logger.warning('Sweep coordinator not available: %s', exc)
|
||||
|
||||
# 9. Auto-heal background loop
|
||||
asyncio.create_task(_auto_heal_loop())
|
||||
|
||||
logger.info('ActiveBlue AI agent service started on port %d', settings.agent_service_port)
|
||||
yield
|
||||
|
||||
@@ -168,6 +171,13 @@ def _register_specialist_agents(agent_registry, peer_bus, odoo, llm_router) -> N
|
||||
except Exception as exc:
|
||||
logger.warning('Could not register finance_agent: %s', exc)
|
||||
|
||||
try:
|
||||
from .agents.sysops_agent import SysopsAgent
|
||||
agent_registry.register('sysops_agent', SysopsAgent(odoo=odoo, llm=llm_router, peer_bus=peer_bus))
|
||||
logger.info('sysops_agent registered')
|
||||
except Exception as exc:
|
||||
logger.warning('Could not register sysops_agent: %s', exc)
|
||||
|
||||
specialist_map = {
|
||||
'accounting_agent': 'AccountingAgent',
|
||||
'crm_agent': 'CrmAgent',
|
||||
@@ -189,6 +199,27 @@ def _register_specialist_agents(agent_registry, peer_bus, odoo, llm_router) -> N
|
||||
logger.warning('Could not register %s: %s', agent_name, exc)
|
||||
|
||||
|
||||
async def _auto_heal_loop(interval: int = 120, startup_delay: int = 90) -> None:
    """Poll system health forever and hand any failures to ``sysops_agent``.

    After a one-time ``startup_delay`` pause, each cycle sleeps ``interval``
    seconds, awaits ``_get_failing_systems()`` and, when the result is
    non-empty, awaits ``sysops_agent.auto_heal(failing)``. Because the
    interval sleep comes first in every cycle, the first health check runs
    ``startup_delay + interval`` seconds after this coroutine starts.

    Any ``Exception`` raised inside a cycle is logged (with traceback) and
    the loop continues with the next cycle; the coroutine never returns on
    its own.

    Args:
        interval: Seconds to sleep before each health check.
        startup_delay: Seconds to wait once, before the first cycle, so
            startup can settle.
    """
    await asyncio.sleep(startup_delay)  # let startup settle before first check
    while True:
        await asyncio.sleep(interval)
        try:
            # Kept inside the try so that an import failure is logged and
            # retried next cycle instead of killing the background task.
            from .routers.health import _get_failing_systems

            failing = await _get_failing_systems()
            if not failing:
                continue

            logger.warning('auto_heal_loop: failing systems: %s', failing)
            registry = app_state.get_agent_registry()
            sysops = registry.get_agent_instance('sysops_agent') if registry else None
            if sysops:
                await sysops.auto_heal(failing)
            else:
                logger.warning('auto_heal_loop: sysops_agent not registered, skipping')
        except Exception as exc:
            # exc_info keeps the exception type and traceback, which the
            # bare '%s' rendering of the exception would otherwise discard.
            logger.warning('auto_heal_loop error: %s', exc, exc_info=True)
|
||||
|
||||
|
||||
def _configure_logging(settings) -> None:
|
||||
level = getattr(logging, settings.log_level.upper(), logging.INFO)
|
||||
if settings.log_format == 'json':
|
||||
|
||||
Reference in New Issue
Block a user