feat: semantic deduplication of multiple photos of same receipt

After parsing all receipts, identify photos that are different shots of the same physical receipt by comparing amount (within $0.05), date (exact match), and vendor similarity (difflib ratio >= 0.6). When either vendor is just an image filename rather than OCR'd text, amount and date alone decide the match. When a duplicate is found, keep whichever photo produced the most OCR text (clearest shot) and report the skipped ones. Zero-amount receipts (OCR failed entirely) are excluded from semantic dedup to avoid false positives.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 01:56:30 -04:00
parent c2d1078d79
commit f90a2ee863
1 changed files with 65 additions and 2 deletions
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 import asyncio
+import difflib
 import json
 import logging
 from datetime import date as _date
@@ -143,15 +144,40 @@ class ExpensesAgent(BaseAgent):
            )
            for r in unique_receipts
        ]
-        parsed_list = await asyncio.gather(*parse_tasks, return_exceptions=True)
+        raw_parsed = await asyncio.gather(*parse_tasks, return_exceptions=True)

-        for receipt, parsed in zip(unique_receipts, parsed_list):
+        # Normalise exceptions to fallback dicts
+        paired: list[tuple[dict, dict]] = []
+        for receipt, parsed in zip(unique_receipts, raw_parsed):
            if isinstance(parsed, Exception):
                logger.warning('expenses_agent: parse failed for %s: %s',
                               receipt.get('filename'), parsed)
                parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
                          'date': receipt.get('date_from_name') or _date.today().isoformat(),
                          'product_name': ''}
+            paired.append((receipt, parsed))
+
+        # Semantic dedup — different photos of the same physical receipt share
+        # the same amount, date, and a similar vendor name.
+        deduped: list[tuple[dict, dict]] = []
+        for receipt, parsed in paired:
+            dup_idx = self._find_semantic_duplicate(parsed, deduped)
+            if dup_idx is not None:
+                # Keep whichever photo produced more OCR text (clearer shot)
+                existing_receipt, _ = deduped[dup_idx]
+                if len(receipt.get('text', '')) > len(existing_receipt.get('text', '')):
+                    deduped[dup_idx] = (receipt, parsed)
+                actions.append(
+                    f"Skipped duplicate photo of "
+                    f"{parsed.get('vendor', receipt.get('filename', 'receipt'))}"
+                    f" ${float(parsed.get('amount', 0)):.2f}"
+                )
+                logger.info('expenses_agent: semantic duplicate %s skipped',
+                            receipt.get('filename'))
+            else:
+                deduped.append((receipt, parsed))
+
+        for receipt, parsed in deduped:

            # Pick product by name match returned from LLM, fall back to default
            product_id = default_product_id
@@ -193,6 +219,43 @@ class ExpensesAgent(BaseAgent):
        self._actions_taken = actions
        return actions

+    @staticmethod
+    def _find_semantic_duplicate(parsed: dict, candidates: list) -> int | None:
+        """
+        Return the index in `candidates` of a receipt that appears to be the
+        same physical receipt as `parsed`, or None if no match found.
+
+        Match criteria (all must pass):
+          1. Same date
+          2. Amount > 0 and within $0.05 of each other
+          3. Vendor name similarity >= 60 %  (or either vendor is a raw image filename)
+        """
+        amt = float(parsed.get('amount', 0))
+        date = parsed.get('date', '')
+        vendor = str(parsed.get('vendor', '')).lower().strip()
+        # If OCR failed the vendor is just a filename — no vendor text to compare, so amount + date decide
+        is_filename = vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
+
+        for idx, (_, other) in enumerate(candidates):
+            other_amt = float(other.get('amount', 0))
+            # Skip zero-amount receipts — too ambiguous to dedup
+            if amt == 0 or other_amt == 0:
+                continue
+            if abs(amt - other_amt) > 0.05:
+                continue
+            if date != other.get('date', ''):
+                continue
+            other_vendor = str(other.get('vendor', '')).lower().strip()
+            other_is_filename = other_vendor.endswith(
+                ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
+            if is_filename or other_is_filename:
+                # Same amount + date, no vendor text to compare — treat as dup
+                return idx
+            ratio = difflib.SequenceMatcher(None, vendor, other_vendor).ratio()
+            if ratio >= 0.6:
+                return idx
+        return None
+
    async def _parse_receipt_text(self, text: str, filename: str,
                                   expense_products: list = None,
                                   date_hint: str = None) -> dict: