feat: semantic deduplication of multiple photos of same receipt

After parsing all receipts, identify photos that are different shots of
the same physical receipt by comparing amount + date + vendor similarity
(difflib ratio >= 0.6). When a duplicate is found, keep whichever photo
produced the most OCR text (clearest shot) and report the skipped ones.

Zero-amount receipts (OCR failed entirely) are excluded from semantic
dedup to avoid false positives.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-16 01:56:30 -04:00
parent c2d1078d79
commit f90a2ee863

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
import asyncio
import difflib
import json
import logging
from datetime import date as _date
@@ -143,15 +144,40 @@ class ExpensesAgent(BaseAgent):
)
for r in unique_receipts
]
parsed_list = await asyncio.gather(*parse_tasks, return_exceptions=True)
raw_parsed = await asyncio.gather(*parse_tasks, return_exceptions=True)
for receipt, parsed in zip(unique_receipts, parsed_list):
# Normalise exceptions to fallback dicts
paired: list[tuple[dict, dict]] = []
for receipt, parsed in zip(unique_receipts, raw_parsed):
if isinstance(parsed, Exception):
logger.warning('expenses_agent: parse failed for %s: %s',
receipt.get('filename'), parsed)
parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
'date': receipt.get('date_from_name') or _date.today().isoformat(),
'product_name': ''}
paired.append((receipt, parsed))
# Semantic dedup — different photos of the same physical receipt share
# the same amount, date, and a similar vendor name.
deduped: list[tuple[dict, dict]] = []
for receipt, parsed in paired:
dup_idx = self._find_semantic_duplicate(parsed, deduped)
if dup_idx is not None:
# Keep whichever photo produced more OCR text (clearer shot)
existing_receipt, _ = deduped[dup_idx]
if len(receipt.get('text', '')) > len(existing_receipt.get('text', '')):
deduped[dup_idx] = (receipt, parsed)
actions.append(
f"Skipped duplicate photo of "
f"{parsed.get('vendor', receipt.get('filename', 'receipt'))}"
f" ${float(parsed.get('amount', 0)):.2f}"
)
logger.info('expenses_agent: semantic duplicate %s skipped',
receipt.get('filename'))
else:
deduped.append((receipt, parsed))
for receipt, parsed in deduped:
# Pick product by name match returned from LLM, fall back to default
product_id = default_product_id
@@ -193,6 +219,43 @@ class ExpensesAgent(BaseAgent):
self._actions_taken = actions
return actions
@staticmethod
def _find_semantic_duplicate(parsed: dict, candidates: list) -> int | None:
    """Return the index of the candidate that is the same physical receipt.

    ``candidates`` is a list of ``(receipt, parsed)`` pairs; only the second
    element of each pair is inspected. The first candidate satisfying every
    criterion below wins; ``None`` is returned when nothing matches.

    Match criteria (all must pass):
      1. Both amounts are usable (numeric, finite, non-zero) and within
         $0.05 of each other.
      2. Both records carry the same ``date`` value.
      3. Either vendor is a raw image filename (no vendor text to compare),
         or the lower-cased vendor names have a difflib similarity
         ratio >= 0.6.

    Amounts that are missing, ``None``, non-numeric or non-finite are
    treated as 0.0, i.e. as "too ambiguous to dedup", instead of raising.
    """
    # Function-scope import keeps this block self-contained.
    import math

    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
    # The tiny epsilon keeps amounts that are exactly 5 cents apart inside
    # the tolerance: 10.05 - 10.00 evaluates to 0.05000000000000071.
    max_amount_gap = 0.05 + 1e-9

    def _amount_of(record: dict) -> float:
        # Parsed data may hold None or free text in 'amount'; fall back to
        # 0.0 so one bad record cannot abort the whole dedup pass.
        try:
            value = float(record.get('amount', 0))
        except (TypeError, ValueError, OverflowError):
            return 0.0
        # NaN/inf compare False against any tolerance and would otherwise
        # match every candidate with the same date.
        return value if math.isfinite(value) else 0.0

    def _vendor_of(record: dict) -> str:
        return str(record.get('vendor', '')).lower().strip()

    amt = _amount_of(parsed)
    # Zero-amount receipts are too ambiguous to dedup; no candidate can match.
    if amt == 0:
        return None

    date = parsed.get('date', '')
    vendor = _vendor_of(parsed)
    # If OCR failed the vendor is just a filename, so there is no vendor
    # text to compare.
    is_filename = vendor.endswith(image_suffixes)

    for idx, (_, other) in enumerate(candidates):
        other_amt = _amount_of(other)
        if other_amt == 0 or abs(amt - other_amt) > max_amount_gap:
            continue
        if date != other.get('date', ''):
            continue
        other_vendor = _vendor_of(other)
        if is_filename or other_vendor.endswith(image_suffixes):
            # Same amount + date, no vendor text to compare: treat as dup.
            return idx
        if difflib.SequenceMatcher(None, vendor, other_vendor).ratio() >= 0.6:
            return idx
    return None
async def _parse_receipt_text(self, text: str, filename: str,
expense_products: list = None,
date_hint: str = None) -> dict: