feat: semantic deduplication of multiple photos of same receipt

After parsing all receipts, identify photos that are different shots of the same physical receipt by comparing amount (within $0.05), date (exact match), and vendor similarity (difflib ratio >= 0.6). When either vendor is just an image filename there is no vendor text to compare, so a matching amount and date alone mark the photo as a duplicate. When a duplicate is found, keep whichever photo produced the most OCR text (clearest shot) and report the skipped ones. Zero-amount receipts (OCR failed entirely) are excluded from semantic dedup to avoid false positives.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 01:56:30 -04:00
parent c2d1078d79
commit f90a2ee863
1 changed files with 65 additions and 2 deletions
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 import asyncio
 import difflib
 import json
 import logging
 from datetime import date as _date
@@ -143,15 +144,40 @@ class ExpensesAgent(BaseAgent):
            )
            for r in unique_receipts
        ]
-        parsed_list = await asyncio.gather(*parse_tasks, return_exceptions=True)
+        raw_parsed = await asyncio.gather(*parse_tasks, return_exceptions=True)
-        for receipt, parsed in zip(unique_receipts, parsed_list):
+        # Normalise exceptions to fallback dicts
        paired: list[tuple[dict, dict]] = []
        for receipt, parsed in zip(unique_receipts, raw_parsed):
            if isinstance(parsed, Exception):
                logger.warning('expenses_agent: parse failed for %s: %s',
                               receipt.get('filename'), parsed)
                parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
                          'date': receipt.get('date_from_name') or _date.today().isoformat(),
                          'product_name': ''}
            paired.append((receipt, parsed))
        # Semantic dedup — different photos of the same physical receipt share
        # the same amount, date, and a similar vendor name.
        deduped: list[tuple[dict, dict]] = []
        for receipt, parsed in paired:
            dup_idx = self._find_semantic_duplicate(parsed, deduped)
            if dup_idx is not None:
                # Keep whichever photo produced more OCR text (clearer shot)
                existing_receipt, _ = deduped[dup_idx]
                if len(receipt.get('text', '')) > len(existing_receipt.get('text', '')):
                    deduped[dup_idx] = (receipt, parsed)
                actions.append(
                    f"Skipped duplicate photo of "
                    f"{parsed.get('vendor', receipt.get('filename', 'receipt'))}"
                    f" ${float(parsed.get('amount', 0)):.2f}"
                )
                logger.info('expenses_agent: semantic duplicate %s skipped',
                            receipt.get('filename'))
            else:
                deduped.append((receipt, parsed))
        for receipt, parsed in deduped:
            # Pick product by name match returned from LLM, fall back to default
            product_id = default_product_id
@@ -193,6 +219,43 @@ class ExpensesAgent(BaseAgent):
        self._actions_taken = actions
        return actions
    @staticmethod
    def _find_semantic_duplicate(parsed: dict, candidates: list) -> int | None:
        """
        Return the index in `candidates` of the first entry that appears to be
        the same physical receipt as `parsed`, or None if nothing matches.

        Args:
            parsed: Parsed receipt fields. The 'amount', 'date' and 'vendor'
                keys are read; each may be absent.
            candidates: Sequence of ``(receipt, parsed)`` pairs. Only the
                second element of each pair is inspected.

        Match criteria (all must pass):
          1. Both amounts are usable (numeric, finite, non-zero) and within
             $0.05 of each other.
          2. The 'date' values are equal.
          3. Vendor similarity (difflib ratio) >= 0.6, OR at least one of the
             two vendors is a raw image filename. In that case there is no
             vendor text to compare, so amount + date alone decide.

        A missing, null, non-numeric or non-finite amount is treated as 0.0,
        i.e. "too ambiguous to dedup", rather than raising.
        """
        image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
        tolerance = 0.05
        # Slack for binary float error: 10.05 - 10.00 evaluates to slightly
        # more than 0.05, which would otherwise reject an exact 5-cent gap.
        epsilon = 1e-9

        def usable_amount(record: dict) -> float:
            """Coerce record['amount'] to a finite float; 0.0 when unusable."""
            try:
                value = float(record.get('amount', 0))
            except (TypeError, ValueError, OverflowError):
                return 0.0
            # NaN and +/-inf fail this range test; left alone they would slip
            # through the tolerance comparison and match unrelated receipts.
            if not (float('-inf') < value < float('inf')):
                return 0.0
            return value

        def vendor_key(record: dict) -> str:
            """Normalise the vendor for case/whitespace-insensitive matching."""
            return str(record.get('vendor', '')).lower().strip()

        amt = usable_amount(parsed)
        if amt == 0:
            # Zero-amount receipts are never deduplicated; no need to scan.
            return None

        date = parsed.get('date', '')
        vendor = vendor_key(parsed)
        is_filename = vendor.endswith(image_suffixes)

        for idx, (_, other) in enumerate(candidates):
            other_amt = usable_amount(other)
            if other_amt == 0:
                continue
            if abs(amt - other_amt) > tolerance + epsilon:
                continue
            if date != other.get('date', ''):
                continue
            other_vendor = vendor_key(other)
            if is_filename or other_vendor.endswith(image_suffixes):
                # Same amount + date, no vendor text to compare — treat as dup
                return idx
            similarity = difflib.SequenceMatcher(None, vendor, other_vendor).ratio()
            if similarity >= 0.6:
                return idx
        return None
    async def _parse_receipt_text(self, text: str, filename: str,
                                   expense_products: list = None,
                                   date_hint: str = None) -> dict: