feat: semantic deduplication of multiple photos of same receipt
After parsing all receipts, identify photos that are different shots of the same physical receipt by comparing amount + date + vendor similarity (difflib ratio >= 0.6). When a duplicate is found, keep whichever photo produced the most OCR text (clearest shot) and report the skipped ones. Zero-amount receipts (OCR failed entirely) are excluded from semantic dedup to avoid false positives.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import difflib
|
||||
import json
|
||||
import logging
|
||||
from datetime import date as _date
|
||||
@@ -143,15 +144,40 @@ class ExpensesAgent(BaseAgent):
|
||||
)
|
||||
for r in unique_receipts
|
||||
]
|
||||
parsed_list = await asyncio.gather(*parse_tasks, return_exceptions=True)
|
||||
raw_parsed = await asyncio.gather(*parse_tasks, return_exceptions=True)
|
||||
|
||||
for receipt, parsed in zip(unique_receipts, parsed_list):
|
||||
# Normalise exceptions to fallback dicts
|
||||
paired: list[tuple[dict, dict]] = []
|
||||
for receipt, parsed in zip(unique_receipts, raw_parsed):
|
||||
if isinstance(parsed, Exception):
|
||||
logger.warning('expenses_agent: parse failed for %s: %s',
|
||||
receipt.get('filename'), parsed)
|
||||
parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
|
||||
'date': receipt.get('date_from_name') or _date.today().isoformat(),
|
||||
'product_name': ''}
|
||||
paired.append((receipt, parsed))
|
||||
|
||||
# Semantic dedup — different photos of the same physical receipt share
|
||||
# the same amount, date, and a similar vendor name.
|
||||
deduped: list[tuple[dict, dict]] = []
|
||||
for receipt, parsed in paired:
|
||||
dup_idx = self._find_semantic_duplicate(parsed, deduped)
|
||||
if dup_idx is not None:
|
||||
# Keep whichever photo produced more OCR text (clearer shot)
|
||||
existing_receipt, _ = deduped[dup_idx]
|
||||
if len(receipt.get('text', '')) > len(existing_receipt.get('text', '')):
|
||||
deduped[dup_idx] = (receipt, parsed)
|
||||
actions.append(
|
||||
f"Skipped duplicate photo of "
|
||||
f"{parsed.get('vendor', receipt.get('filename', 'receipt'))}"
|
||||
f" ${float(parsed.get('amount', 0)):.2f}"
|
||||
)
|
||||
logger.info('expenses_agent: semantic duplicate %s skipped',
|
||||
receipt.get('filename'))
|
||||
else:
|
||||
deduped.append((receipt, parsed))
|
||||
|
||||
for receipt, parsed in deduped:
|
||||
|
||||
# Pick product by name match returned from LLM, fall back to default
|
||||
product_id = default_product_id
|
||||
@@ -193,6 +219,43 @@ class ExpensesAgent(BaseAgent):
|
||||
self._actions_taken = actions
|
||||
return actions
|
||||
|
||||
@staticmethod
def _find_semantic_duplicate(parsed: dict, candidates: list) -> int | None:
    """
    Return the index in `candidates` of a receipt that appears to be the
    same physical receipt as `parsed`, or None if no match is found.

    `candidates` is a list of ``(receipt, parsed)`` pairs; only the second
    element of each pair is inspected. The first matching candidate wins.

    Match criteria (all must pass):
      1. Both amounts are usable (numeric, finite, non-zero) and within
         $0.05 of each other
      2. Same date string
      3. Vendor name similarity >= 60 %, or either vendor is a raw image
         filename (in which case there is no vendor text to compare)

    Amounts that are missing, None, non-numeric, NaN or infinite are
    treated like zero amounts: they never match anything.
    """
    import math

    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
    # Tiny slack so an exact 5-cent gap (e.g. 10.05 vs 10.00) is not rejected
    # by binary floating-point representation error.
    tolerance = 0.05 + 1e-9

    def _amount_of(record: dict) -> float:
        # Parsed values come from OCR/LLM output, so the amount may be None
        # or free text; treat anything unusable as "no amount".
        try:
            value = float(record.get('amount', 0))
        except (TypeError, ValueError):
            return 0.0
        return value if math.isfinite(value) else 0.0

    amt = _amount_of(parsed)
    # Zero-amount receipts are too ambiguous to dedup; no candidate can match.
    if amt == 0:
        return None

    date = parsed.get('date', '')
    vendor = str(parsed.get('vendor', '')).lower().strip()
    # A vendor that is just an image filename carries no comparable text.
    is_filename = vendor.endswith(image_suffixes)

    for idx, (_, other) in enumerate(candidates):
        other_amt = _amount_of(other)
        if other_amt == 0:
            continue
        if abs(amt - other_amt) > tolerance:
            continue
        if date != other.get('date', ''):
            continue

        other_vendor = str(other.get('vendor', '')).lower().strip()
        if is_filename or other_vendor.endswith(image_suffixes):
            # Same amount + date, no vendor text to compare — treat as dup
            return idx

        ratio = difflib.SequenceMatcher(None, vendor, other_vendor).ratio()
        if ratio >= 0.6:
            return idx

    return None
|
||||
|
||||
async def _parse_receipt_text(self, text: str, filename: str,
|
||||
expense_products: list = None,
|
||||
date_hint: str = None) -> dict:
|
||||
|
||||
Reference in New Issue
Block a user