feat(infra): add sweep coordinator, structured logging, test suite, and README
Sweep coordinator (Step 16): - SweepCoordinator runs all 8 agents in parallel with 60s per-agent / 300s total timeout - Aggregates findings, actions, errors into SweepCoordinatorResult - Registered in FastAPI lifespan; triggered via POST /sweep Structured logging (Step 18): - logging_utils/structured.py: JSONFormatter emitting ts/level/logger/msg + custom fields - log_directive_event() for structured directive lifecycle logging - push_to_loki() async Loki push (graceful no-op if LOKI_URL unset) - configure_logging() replaces root handler at startup Tests (Steps 17+19): - conftest.py: mock_odoo, mock_pool, mock_llm fixtures - test_tool_validator.py: 9 tests covering validation, coercion, hallucination stripping - test_llm_router.py: 6 tests covering local/cloud/hybrid modes and HIPAA enforcement - test_peer_bus.py: 6 tests covering registration, timeout, depth, circular detection - test_finance_agent.py: 10 tests covering all 6 steps + sweep + peer request - test_memory_manager.py: 3 tests covering context build + hard cap enforcement - test_dispatch_router.py: 3 tests covering dispatch, rate limit, health endpoint - test_odoo_client.py: 4 tests covering search_read, write result, unlink warning - test_e2e_dispatch.py: 2 E2E tests - full dispatch cycle + peer bus communication README (Step 20): architecture diagram, privacy modes, quick start, env vars, structure Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
100
agent_service/agents/sweep_coordinator.py
Normal file
100
agent_service/agents/sweep_coordinator.py
Normal file
@@ -0,0 +1,100 @@
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ALL_AGENT_NAMES = [
|
||||
'finance_agent', 'accounting_agent', 'crm_agent', 'sales_agent',
|
||||
'project_agent', 'elearning_agent', 'expenses_agent', 'employees_agent',
|
||||
]
|
||||
|
||||
|
||||
@dataclass
|
||||
class SweepCoordinatorResult:
|
||||
total_agents: int = 0
|
||||
completed: int = 0
|
||||
failed: int = 0
|
||||
total_findings: int = 0
|
||||
total_actions: int = 0
|
||||
agent_results: list = field(default_factory=list)
|
||||
errors: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
class SweepCoordinator:
|
||||
def __init__(self, peer_bus=None):
|
||||
self._peer_bus = peer_bus
|
||||
|
||||
async def run_sweep(self, agents: Optional[list[str]] = None) -> dict:
|
||||
names = agents or ALL_AGENT_NAMES
|
||||
result = SweepCoordinatorResult(total_agents=len(names))
|
||||
logger.info('SweepCoordinator: starting sweep for %d agents', len(names))
|
||||
|
||||
tasks = {}
|
||||
for name in names:
|
||||
agent = self._get_agent(name)
|
||||
if agent is None:
|
||||
logger.debug('Sweep: agent %s not available, skipping', name)
|
||||
result.failed += 1
|
||||
result.errors[name] = 'agent not available'
|
||||
continue
|
||||
tasks[name] = asyncio.create_task(self._run_agent_sweep(name, agent))
|
||||
|
||||
if not tasks:
|
||||
logger.warning('SweepCoordinator: no agents available for sweep')
|
||||
return result.__dict__
|
||||
|
||||
done, _ = await asyncio.wait(list(tasks.values()), timeout=300)
|
||||
|
||||
for name, task in tasks.items():
|
||||
if task not in done:
|
||||
logger.warning('Sweep timeout for agent %s', name)
|
||||
result.failed += 1
|
||||
result.errors[name] = 'timeout'
|
||||
task.cancel()
|
||||
continue
|
||||
try:
|
||||
sweep_report = task.result()
|
||||
result.completed += 1
|
||||
result.total_findings += len(sweep_report.findings)
|
||||
result.total_actions += len(sweep_report.actions)
|
||||
result.agent_results.append({
|
||||
'agent': name,
|
||||
'findings': sweep_report.findings,
|
||||
'actions': sweep_report.actions,
|
||||
'summary': sweep_report.summary,
|
||||
'error': sweep_report.error,
|
||||
})
|
||||
logger.info(
|
||||
'Sweep %s: %d findings, %d actions',
|
||||
name, len(sweep_report.findings), len(sweep_report.actions),
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.error('Sweep result error for %s: %s', name, exc)
|
||||
result.failed += 1
|
||||
result.errors[name] = str(exc)
|
||||
|
||||
logger.info(
|
||||
'SweepCoordinator done: %d completed, %d failed, %d findings, %d actions',
|
||||
result.completed, result.failed, result.total_findings, result.total_actions,
|
||||
)
|
||||
return result.__dict__
|
||||
|
||||
async def _run_agent_sweep(self, name: str, agent):
|
||||
try:
|
||||
return await asyncio.wait_for(agent.sweep(), timeout=60)
|
||||
except asyncio.TimeoutError:
|
||||
from .base_agent import SweepReport
|
||||
logger.warning('Agent sweep timed out: %s', name)
|
||||
return SweepReport(agent=name, findings=[], actions=[], error='timeout')
|
||||
except Exception as exc:
|
||||
from .base_agent import SweepReport
|
||||
logger.error('Agent sweep error %s: %s', name, exc)
|
||||
return SweepReport(agent=name, findings=[], actions=[], error=str(exc))
|
||||
|
||||
def _get_agent(self, name: str):
|
||||
if self._peer_bus is None:
|
||||
return None
|
||||
return self._peer_bus.get_agent(name)
|
||||
Reference in New Issue
Block a user