diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py index 1970d22..a3efeff 100644 --- a/agent_service/agents/expenses_agent.py +++ b/agent_service/agents/expenses_agent.py @@ -1,5 +1,6 @@ from __future__ import annotations import asyncio +import difflib import json import logging from datetime import date as _date @@ -143,15 +144,40 @@ class ExpensesAgent(BaseAgent): ) for r in unique_receipts ] - parsed_list = await asyncio.gather(*parse_tasks, return_exceptions=True) + raw_parsed = await asyncio.gather(*parse_tasks, return_exceptions=True) - for receipt, parsed in zip(unique_receipts, parsed_list): + # Normalise exceptions to fallback dicts + paired: list[tuple[dict, dict]] = [] + for receipt, parsed in zip(unique_receipts, raw_parsed): if isinstance(parsed, Exception): logger.warning('expenses_agent: parse failed for %s: %s', receipt.get('filename'), parsed) parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0, 'date': receipt.get('date_from_name') or _date.today().isoformat(), 'product_name': ''} + paired.append((receipt, parsed)) + + # Semantic dedup — different photos of the same physical receipt share + # the same amount, date, and a similar vendor name. 
@staticmethod
def _find_semantic_duplicate(parsed: dict, candidates: list) -> int | None:
    """
    Return the index in `candidates` of a receipt that appears to be the
    same physical receipt as `parsed`, or None if no match is found.

    Args:
        parsed: Parsed receipt fields; the keys read here are 'amount',
            'date' and 'vendor'.
        candidates: Sequence of ``(receipt, parsed)`` pairs already accepted.
            Only the second element of each pair is inspected.

    Match criteria (all must pass):
      1. Same date.
      2. Both amounts are non-zero and within $0.05 of each other
         (inclusive).
      3. Vendor name similarity >= 60 %, or at least one of the two vendors
         is a raw image filename, in which case there is no vendor text to
         compare and amount + date alone decide.

    An amount that is missing, None, or not convertible to a number is
    treated as zero, i.e. the receipt is never considered a duplicate.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
    # Binary floats cannot represent most cent values exactly
    # (10.05 - 10.00 == 0.05000000000000071), so allow a tiny slack on top
    # of the five-cent window to keep the documented bound inclusive.
    tolerance = 0.05 + 1e-9

    def as_amount(raw) -> float:
        # Parsed amounts may be None or free text; fall back to "unknown".
        try:
            return float(raw)
        except (TypeError, ValueError):
            return 0.0

    def vendor_key(entry: dict) -> str:
        return str(entry.get('vendor', '')).lower().strip()

    amt = as_amount(parsed.get('amount', 0))
    # Zero-amount receipts are too ambiguous to dedup against anything.
    if amt == 0:
        return None

    date = parsed.get('date', '')
    vendor = vendor_key(parsed)
    # If OCR failed the vendor is just a filename - can't dedup by content.
    is_filename = vendor.endswith(image_suffixes)

    for idx, (_, other) in enumerate(candidates):
        other_amt = as_amount(other.get('amount', 0))
        if other_amt == 0 or abs(amt - other_amt) > tolerance:
            continue
        if date != other.get('date', ''):
            continue
        other_vendor = vendor_key(other)
        if is_filename or other_vendor.endswith(image_suffixes):
            # Same amount + date, no vendor text to compare - treat as dup.
            return idx
        similarity = difflib.SequenceMatcher(None, vendor, other_vendor).ratio()
        if similarity >= 0.6:
            return idx
    return None