After parsing all receipts, identify photos that are different shots of the same physical receipt by comparing amount + date + vendor similarity (difflib ratio >= 0.6). When a duplicate is found, keep whichever photo produced the most OCR text (clearest shot) and report the skipped ones. Zero-amount receipts (OCR failed entirely) are excluded from semantic dedup to avoid false positives. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
395 lines
18 KiB
Python
395 lines
18 KiB
Python
from __future__ import annotations
|
|
import asyncio
|
|
import difflib
|
|
import json
|
|
import logging
|
|
from datetime import date as _date
|
|
from .base_agent import BaseAgent, AgentReport, AgentDirective, SweepReport
|
|
from ..tools.expenses_tools import ExpensesTools
|
|
|
|
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)

# Tool descriptors advertised by the expenses agent.
#
# Each entry has a tool ``name``, a human-readable ``description`` and a
# ``parameters`` mapping of argument name -> spec.  A spec always carries a
# ``type``; arguments marked ``'optional': True`` may be omitted, the others
# carry no such marker.  The names listed here are the same keys the agent's
# ``_dispatch_tool`` table resolves.
EXPENSES_TOOLS = [
    {
        'name': 'get_expenses',
        'description': 'Retrieve expense records',
        'parameters': {
            'employee_id': {'type': 'integer', 'optional': True},
            'state': {'type': 'string', 'optional': True},
            'date_from': {'type': 'string', 'optional': True},
            'date_to': {'type': 'string', 'optional': True},
            'limit': {'type': 'integer', 'optional': True},
        },
    },
    {
        'name': 'get_expense_sheets',
        'description': 'Get expense report sheets',
        'parameters': {
            'state': {'type': 'string', 'optional': True},
            'employee_id': {'type': 'integer', 'optional': True},
            'limit': {'type': 'integer', 'optional': True},
        },
    },
    {
        'name': 'get_pending_approvals',
        'description': 'Get expense sheets pending approval',
        'parameters': {},
    },
    {
        'name': 'approve_expense_sheet',
        'description': 'Approve an expense sheet',
        'parameters': {'sheet_id': {'type': 'integer'}},
    },
    {
        'name': 'get_expenses_summary',
        'description': 'Get expense summary for a period',
        'parameters': {
            'date_from': {'type': 'string', 'optional': True},
            'date_to': {'type': 'string', 'optional': True},
        },
    },
    {
        'name': 'get_expense_by_employee',
        'description': 'Get expenses for a specific employee',
        'parameters': {
            'employee_id': {'type': 'integer'},
            'limit': {'type': 'integer', 'optional': True},
        },
    },
    {
        'name': 'flag_for_review',
        'description': 'Flag an expense for review',
        'parameters': {
            'model': {'type': 'string'},
            'record_id': {'type': 'integer'},
            'reason': {'type': 'string'},
            'severity': {'type': 'string', 'optional': True},
        },
    },
    {
        'name': 'post_chatter_note',
        'description': 'Post a note on a record',
        'parameters': {
            'model': {'type': 'string'},
            'record_id': {'type': 'integer'},
            'note': {'type': 'string'},
        },
    },
]
|
|
|
|
|
|
class ExpensesAgent(BaseAgent):
    """Expenses-domain agent.

    The agent runs in one of two modes, selected in ``_plan``:

    * ``read`` - fetch the expense summary (and, when the task asks for it,
      the pending approvals) and report on them.
    * ``create_from_receipts`` - used when the directive context carries
      receipts: create one expense sheet, then one expense per distinct
      receipt, attaching the image when a base64 payload is present.
    """

    name = 'expenses_agent'
    domain = 'expenses'
    required_odoo_module = 'hr_expense'
    system_prompt_file = 'expenses_system.txt'
    tools = EXPENSES_TOOLS

    def __init__(self, odoo, llm, peer_bus=None):
        """Wire up the expenses tool wrapper and the per-run state holders."""
        super().__init__(odoo, llm, peer_bus)
        self._et = ExpensesTools(odoo)
        # Per-run state, written by _gather/_reason/_act and read by _report.
        self._gathered_data: dict = {}
        self._actions_taken: list = []
        self._escalations_list: list = []

    async def _plan(self) -> dict:
        """Derive the execution plan from the current directive.

        Returns a dict with ``mode`` (``'create_from_receipts'`` when the
        directive context has receipts, otherwise ``'read'``), the
        ``fetch_summary`` / ``fetch_pending`` keyword flags (always False in
        receipt mode) and the ``employee_id`` / ``date_from`` / ``date_to``
        directive params (None when there is no directive).
        """
        directive = self._directive
        task = (directive.task if directive else '').lower()
        receipts = getattr(directive.context, 'receipts', []) if directive else []
        params = directive.params if directive else {}
        read_only = not receipts

        return {
            'mode': 'read' if read_only else 'create_from_receipts',
            'fetch_summary': read_only and any(k in task for k in ('summary', 'overview')),
            'fetch_pending': read_only and any(
                k in task for k in ('pending', 'approve', 'approval')),
            'employee_id': params.get('employee_id'),
            'date_from': params.get('date_from'),
            'date_to': params.get('date_to'),
        }

    async def _gather(self, plan: dict) -> dict:
        """Collect the data the plan calls for and cache it on the instance.

        Receipt mode gathers nothing here (the work happens in ``_act``).
        Read mode always fetches the expenses summary - the plan's
        ``fetch_summary`` flag is not consulted - and adds the pending
        approvals when ``fetch_pending`` is set.
        """
        data: dict = {'mode': plan.get('mode', 'read')}
        if plan.get('mode') != 'create_from_receipts':
            data['summary'] = await self._et.get_expenses_summary(
                date_from=plan.get('date_from'), date_to=plan.get('date_to'),
            )
            if plan.get('fetch_pending'):
                data['pending'] = await self._et.get_pending_approvals()
        self._gathered_data = data
        return data

    async def _reason(self) -> dict:
        """Turn gathered data into escalations.

        Returns ``{'escalations': [...], 'flags': []}``.  The only rule is an
        escalation when more than 10 expense sheets are pending approval;
        receipt mode produces no escalations at this stage.  The escalation
        list is also stored on the instance for ``_act`` / ``_report``.
        """
        data = self._gathered_data
        analysis: dict = {'escalations': [], 'flags': []}
        if data.get('mode') == 'create_from_receipts':
            self._escalations_list = []
            return analysis

        # Tolerate a missing/None summary or count instead of raising.
        summary = data.get('summary') or {}
        pending_count = summary.get('pending_approval_count') or 0
        if pending_count > 10:
            analysis['escalations'].append(
                f'{pending_count} expense sheets pending approval.'
            )
        self._escalations_list = analysis['escalations']
        return analysis

    async def _act(self, reasoning: dict) -> list:
        """Create an expense sheet and its expenses from the directive's receipts.

        Does nothing (returns ``[]``) outside receipt mode or when there are
        no receipts.  Otherwise: resolve the requesting user's employee,
        create the sheet, drop byte-identical uploads (same SHA256), parse
        every remaining receipt through the LLM, drop different photos of the
        same physical receipt, then create one expense per survivor.

        Returns the list of human-readable action strings, which is also
        stored for ``_report``.  Failures to find the employee or create the
        sheet are recorded as escalations and yield ``[]``.
        """
        # Start from a clean slate so an early exit below cannot leave a
        # previous run's actions behind for _report to present as a success.
        self._actions_taken = []

        if self._gathered_data.get('mode') != 'create_from_receipts':
            return []
        receipts = getattr(self._directive.context, 'receipts', []) if self._directive else []
        if not receipts:
            return []

        # A non-empty receipts list implies the directive exists.
        user_id = self._directive.context.peer_data.get('requesting_user_id')
        employee_id = await self._et.get_employee_id_for_user(user_id)
        if not employee_id:
            self._escalations_list.append(
                'No employee record found for the current user; cannot create expense report.')
            return []

        sheet_name = f'Expense Report - {_date.today().isoformat()}'
        sheet_result = await self._et.create_expense_sheet(sheet_name, employee_id)
        if not sheet_result.success:
            self._escalations_list.append(f'Failed to create expense sheet: {sheet_result.error}')
            return []

        sheet_id = sheet_result.record_id
        actions = [f'Created expense sheet "{sheet_name}" (ID {sheet_id})']

        # Fetch all expensable products once for category selection.
        expense_products = (await self._et.get_expense_products()) or []
        default_product_id = expense_products[0]['id'] if expense_products else None
        product_map = {p['id']: p['name'] for p in expense_products}
        # Case-insensitive name -> id lookup; setdefault keeps the first
        # product when two share a name, as a first-match scan would.
        product_ids_by_name: dict = {}
        for product in expense_products:
            product_ids_by_name.setdefault(product['name'].lower(), product['id'])

        unique_receipts = self._dedupe_by_hash(receipts, actions)

        # Parse all receipts concurrently; any concurrency limit is left to
        # the LLM client.
        raw_parsed = await asyncio.gather(
            *(
                self._parse_receipt_text(
                    r.get('text', ''), r.get('filename', 'receipt'),
                    expense_products=expense_products,
                    date_hint=r.get('date_from_name'),
                )
                for r in unique_receipts
            ),
            return_exceptions=True,
        )

        # Normalise exceptions to fallback dicts.  BaseException rather than
        # Exception: a cancelled child surfaces as CancelledError, which is
        # not an Exception subclass.
        paired: list[tuple[dict, dict]] = []
        for receipt, parsed in zip(unique_receipts, raw_parsed):
            if isinstance(parsed, BaseException):
                logger.warning('expenses_agent: parse failed for %s: %s',
                               receipt.get('filename'), parsed)
                parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
                          'date': receipt.get('date_from_name') or _date.today().isoformat(),
                          'product_name': ''}
            paired.append((receipt, parsed))

        deduped = self._dedupe_semantic(paired, actions)

        for receipt, parsed in deduped:
            # Pick product by the name the LLM returned, fall back to default.
            product_id = default_product_id
            chosen_name = parsed.get('product_name', '')
            if chosen_name:
                product_id = product_ids_by_name.get(chosen_name.lower(), default_product_id)

            expense_result = await self._et.create_expense(
                sheet_id=sheet_id,
                employee_id=employee_id,
                name=str(parsed.get('vendor', receipt.get('filename', 'Expense')))[:64],
                total_amount=float(parsed.get('amount', 0.0)),
                date=str(parsed.get('date') or _date.today().isoformat()),
                product_id=product_id,
            )
            if not expense_result.success:
                actions.append(
                    f"Could not create expense for {receipt.get('filename', 'receipt')}: "
                    f"{expense_result.error}"
                )
                continue

            cat = product_map.get(product_id, 'Expense')
            actions.append(
                f"Added: {parsed.get('vendor', 'Unknown vendor')} "
                f"${float(parsed.get('amount', 0)):.2f} "
                f"({cat}) on {parsed.get('date', 'today')}"
            )
            if receipt.get('b64'):
                # NOTE(review): the attach result is not inspected, so a
                # failed attachment is currently silent.
                await self._et.attach_receipt(
                    'hr.expense', expense_result.record_id,
                    receipt.get('filename', 'receipt'),
                    receipt['b64'],
                    receipt.get('mimetype', 'application/octet-stream'),
                )

        self._actions_taken = actions
        return actions

    @staticmethod
    def _dedupe_by_hash(receipts: list, actions: list) -> list:
        """Drop receipts whose ``sha256`` was already seen (same image uploaded twice).

        Receipts without a hash are always kept.  A "Skipped duplicate" line
        is appended to ``actions`` for every receipt dropped.
        """
        seen_hashes: set = set()
        unique_receipts = []
        for r in receipts:
            h = r.get('sha256')
            if h and h in seen_hashes:
                logger.info('expenses_agent: skipping duplicate receipt %s', r.get('filename'))
                actions.append(f"Skipped duplicate: {r.get('filename', 'receipt')}")
                continue
            if h:
                seen_hashes.add(h)
            unique_receipts.append(r)
        return unique_receipts

    def _dedupe_semantic(self, paired: list, actions: list) -> list:
        """Collapse different photos of the same physical receipt.

        ``paired`` is a list of ``(receipt, parsed)`` tuples.  When
        ``_find_semantic_duplicate`` matches an already-kept entry, the photo
        with more OCR text (taken as the clearer shot) is kept, ties going to
        the earlier one, and the other is reported in ``actions`` as skipped.
        """
        kept: list[tuple[dict, dict]] = []
        for receipt, parsed in paired:
            dup_idx = self._find_semantic_duplicate(parsed, kept)
            if dup_idx is None:
                kept.append((receipt, parsed))
                continue

            existing_receipt, existing_parsed = kept[dup_idx]
            # `or ''` because a receipt may carry text=None.
            if len(receipt.get('text') or '') > len(existing_receipt.get('text') or ''):
                kept[dup_idx] = (receipt, parsed)
                skipped_receipt, skipped_parsed = existing_receipt, existing_parsed
            else:
                skipped_receipt, skipped_parsed = receipt, parsed

            # Report the photo that was actually discarded, not the incoming one.
            actions.append(
                f"Skipped duplicate photo of "
                f"{skipped_parsed.get('vendor', skipped_receipt.get('filename', 'receipt'))}"
                f" ${float(skipped_parsed.get('amount', 0)):.2f}"
            )
            logger.info('expenses_agent: semantic duplicate %s skipped',
                        skipped_receipt.get('filename'))
        return kept

    @staticmethod
    def _find_semantic_duplicate(parsed: dict, candidates: list) -> int | None:
        """
        Return the index in `candidates` of a receipt that appears to be the
        same physical receipt as `parsed`, or None if no match is found.

        `candidates` is a list of ``(receipt, parsed)`` tuples; only the
        parsed half is inspected.

        Match criteria (all must pass):
          1. Same date
          2. Both amounts non-zero and within $0.05 of each other
          3. Vendor name similarity >= 60 % (difflib ratio), or at least one
             of the two vendors is just an image filename, in which case
             there is no vendor text to compare and amount + date decide
        """
        image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')

        amt = float(parsed.get('amount') or 0)
        # Zero-amount receipts are too ambiguous to dedup.
        if amt == 0:
            return None

        date = parsed.get('date', '')
        vendor = str(parsed.get('vendor', '')).lower().strip()
        is_filename = vendor.endswith(image_suffixes)

        for idx, (_, other) in enumerate(candidates):
            other_amt = float(other.get('amount') or 0)
            if other_amt == 0:
                continue
            if abs(amt - other_amt) > 0.05:
                continue
            if date != other.get('date', ''):
                continue
            other_vendor = str(other.get('vendor', '')).lower().strip()
            if is_filename or other_vendor.endswith(image_suffixes):
                # Same amount + date, no vendor text to compare - treat as dup.
                return idx
            if difflib.SequenceMatcher(None, vendor, other_vendor).ratio() >= 0.6:
                return idx
        return None

    @staticmethod
    def _coerce_amount(value) -> float:
        """Best-effort conversion of an LLM-supplied amount to a finite float.

        Accepts numbers and numeric strings (a ``$`` sign is ignored).
        Anything else - None, booleans, unparseable text, NaN, infinity -
        becomes 0.0.
        """
        if value is None or isinstance(value, bool):
            return 0.0
        if isinstance(value, str):
            value = value.replace('$', '').strip()
        try:
            amount = float(value)
        except (TypeError, ValueError):
            return 0.0
        # NaN is the only float not equal to itself.
        if amount != amount or amount in (float('inf'), float('-inf')):
            return 0.0
        return amount

    async def _parse_receipt_text(self, text: str, filename: str,
                                  expense_products: list | None = None,
                                  date_hint: str | None = None) -> dict:
        """Ask the LLM to extract expense fields from one receipt's OCR text.

        Args:
            text: OCR output.  Empty text, or text starting with ``[``, is
                treated as an OCR failure (presumably a bracketed error
                marker - confirm against the OCR producer).
            filename: Receipt filename; used as the vendor fallback.
            expense_products: Product dicts with a ``name`` key, offered to
                the LLM as category choices.
            date_hint: Date to use when the text yields none.

        Returns:
            Dict with ``vendor``, ``amount``, ``date`` and ``product_name``.
            If the LLM call fails or returns no JSON object, the fallback
            (filename as vendor, amount 0.0, hint or today's date) is
            returned.  When OCR failed the LLM is asked only for a category,
            and only if a product list exists.
        """
        today = _date.today().isoformat()
        fallback = {'vendor': filename, 'amount': 0.0,
                    'date': date_hint or today, 'product_name': ''}
        ocr_failed = not text or text.startswith('[')

        product_list = ''
        if expense_products:
            product_list = ', '.join(f'"{p["name"]}"' for p in expense_products)

        if ocr_failed:
            # No OCR text - still try to classify the category from the filename.
            if not product_list:
                return fallback
            prompt = (
                f'A receipt photo named "{filename}" could not be read by OCR. '
                f'Based only on the filename, pick the most likely expense category '
                f'from this list: [{product_list}]. '
                f'Return ONLY valid JSON: {{"product_name": "..."}}'
            )
        else:
            prompt = (
                'Extract expense details from the following receipt text. '
                'Return ONLY valid JSON with these keys:\n'
                '"vendor" (string, merchant or restaurant name),\n'
                '"amount" (number — the FINAL total the customer paid; '
                'this is labeled "Total", "Amount Due", "Grand Total", or the last dollar figure; '
                'do NOT use subtotal, tax, or tip separately; '
                'if multiple totals appear pick the largest one labeled as the final total),\n'
                f'"date" (string YYYY-MM-DD, use {date_hint or today} if not found in text),\n'
                f'"product_name" (string, pick the best match from [{product_list}] or empty string).\n\n'
                f'Receipt text:\n{text[:2000]}\n\nJSON only:'
            )

        try:
            resp = await self._llm.submit(
                [{'role': 'user', 'content': prompt}],
                caller='expenses_agent_receipt_parser',
            )
            raw = (resp.content or '').strip()
            # Take the outermost {...} span so surrounding chatter is ignored.
            first, last = raw.find('{'), raw.rfind('}')
            if first != -1 and last > first:
                data = json.loads(raw[first:last + 1])
                # `or` rather than a .get default: an explicit JSON null must
                # fall back too, not become the string 'None'.
                return {
                    'vendor': str(data.get('vendor') or filename),
                    'amount': self._coerce_amount(data.get('amount')),
                    'date': str(data.get('date') or date_hint or today),
                    'product_name': str(data.get('product_name') or ''),
                }
        except Exception as exc:
            # Best-effort boundary: any LLM/JSON failure degrades to the fallback.
            logger.warning('Receipt parse failed for %s: %s', filename, exc)
        return fallback

    async def _report(self) -> AgentReport:
        """Build the final report for the current directive.

        Receipt mode: status ``complete`` with the action list when any
        action was recorded, otherwise ``failed`` with the escalations.
        Read mode: a one-paragraph summary of totals and pending approvals,
        always ``complete``.
        """
        data = self._gathered_data
        directive_id = self._directive.directive_id if self._directive else ''

        if data.get('mode') == 'create_from_receipts':
            if self._actions_taken:
                lines = '\n'.join(f'  • {a}' for a in self._actions_taken)
                summary = (
                    f'Expense report created successfully:\n{lines}\n\n'
                    'The report is in draft. Please open Odoo > Expenses, '
                    'review the entries, and click Submit to send for approval.'
                )
                status = 'complete'
            else:
                summary = ('Could not create expense report. ' +
                           '; '.join(self._escalations_list or ['Unknown error']))
                status = 'failed'
            return AgentReport(
                directive_id=directive_id, agent=self.name, status=status,
                summary=summary, data=data,
                escalations=self._escalations_list, actions_taken=self._actions_taken)

        summary_data = data.get('summary') or {}
        parts = []
        if summary_data:
            # `or 0` so a None total/count formats instead of raising.
            parts.append(
                f'Expenses: {summary_data.get("total_expenses") or 0} records, '
                f'total ${summary_data.get("total_amount") or 0:.2f}. '
                f'{summary_data.get("pending_approval_count") or 0} pending approval.'
            )
        if not parts:
            parts.append('Expenses review complete.')
        return AgentReport(
            directive_id=directive_id, agent=self.name, status='complete',
            summary='\n'.join(parts), data=data,
            escalations=self._escalations_list, actions_taken=[])

    async def _dispatch_tool(self, name: str, args: dict):
        """Invoke the expenses tool called ``name`` with keyword ``args``.

        Raises:
            ValueError: if ``name`` is not one of the known tools.
        """
        dispatch = {
            'get_expenses': self._et.get_expenses,
            'get_expense_sheets': self._et.get_expense_sheets,
            'get_pending_approvals': self._et.get_pending_approvals,
            'approve_expense_sheet': self._et.approve_expense_sheet,
            'get_expenses_summary': self._et.get_expenses_summary,
            'get_expense_by_employee': self._et.get_expense_by_employee,
            'flag_for_review': self._et.flag_for_review,
            'post_chatter_note': self._et.post_chatter_note,
        }
        handler = dispatch.get(name)
        if handler is None:
            raise ValueError(f'Unknown tool: {name}')
        return await handler(**args)

    async def handle_peer_request(self, request: dict) -> dict:
        """Answer a request from another agent.

        Supports ``type`` values ``expenses_summary`` and
        ``employee_expenses`` (the latter needs ``employee_id``).  Never
        raises: unknown types and any failure come back as ``{'error': ...}``.
        """
        req_type = request.get('type', '')
        try:
            if req_type == 'expenses_summary':
                return await self._et.get_expenses_summary()
            if req_type == 'employee_expenses':
                expenses = await self._et.get_expense_by_employee(
                    employee_id=request['employee_id'])
                return {'expenses': expenses}
            return {'error': f'Unknown type: {req_type}'}
        except Exception as exc:
            return {'error': str(exc)}

    async def sweep(self) -> SweepReport:
        """List every expense sheet pending approval as a low-severity finding.

        Any failure while fetching or reading the sheets yields a report with
        no findings and the error text.
        """
        findings = []
        try:
            pending = await self._et.get_pending_approvals()
            for sheet in pending or []:
                # Presumably an (id, display name) pair; anything else
                # (e.g. a falsy placeholder) yields an empty employee name.
                emp = sheet.get('employee_id', [0, ''])
                has_name = isinstance(emp, (list, tuple)) and len(emp) > 1
                findings.append({
                    'type': 'pending_expense_approval',
                    'sheet_id': sheet.get('id'),
                    'employee': emp[1] if has_name else '',
                    'amount': sheet.get('total_amount', 0),
                    'severity': 'low',
                })
        except Exception as exc:
            return SweepReport(agent=self.name, findings=[], error=str(exc))
        return SweepReport(agent=self.name, findings=findings, actions_taken=[],
                           summary=f'Expenses sweep: {len(findings)} pending approvals.')
|