feat: semantic deduplication of multiple photos of same receipt
After parsing all receipts, identify photos that are different shots of the same physical receipt by comparing amount + date + vendor similarity (difflib ratio >= 0.6). When a duplicate is found, keep whichever photo produced the most OCR text (clearest shot) and report the skipped ones. Zero-amount receipts (OCR failed entirely) are excluded from semantic dedup to avoid false positives.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import difflib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from datetime import date as _date
|
from datetime import date as _date
|
||||||
@@ -143,15 +144,40 @@ class ExpensesAgent(BaseAgent):
|
|||||||
)
|
)
|
||||||
for r in unique_receipts
|
for r in unique_receipts
|
||||||
]
|
]
|
||||||
parsed_list = await asyncio.gather(*parse_tasks, return_exceptions=True)
|
raw_parsed = await asyncio.gather(*parse_tasks, return_exceptions=True)
|
||||||
|
|
||||||
for receipt, parsed in zip(unique_receipts, parsed_list):
|
# Normalise exceptions to fallback dicts
|
||||||
|
paired: list[tuple[dict, dict]] = []
|
||||||
|
for receipt, parsed in zip(unique_receipts, raw_parsed):
|
||||||
if isinstance(parsed, Exception):
|
if isinstance(parsed, Exception):
|
||||||
logger.warning('expenses_agent: parse failed for %s: %s',
|
logger.warning('expenses_agent: parse failed for %s: %s',
|
||||||
receipt.get('filename'), parsed)
|
receipt.get('filename'), parsed)
|
||||||
parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
|
parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
|
||||||
'date': receipt.get('date_from_name') or _date.today().isoformat(),
|
'date': receipt.get('date_from_name') or _date.today().isoformat(),
|
||||||
'product_name': ''}
|
'product_name': ''}
|
||||||
|
paired.append((receipt, parsed))
|
||||||
|
|
||||||
|
# Semantic dedup — different photos of the same physical receipt share
|
||||||
|
# the same amount, date, and a similar vendor name.
|
||||||
|
deduped: list[tuple[dict, dict]] = []
|
||||||
|
for receipt, parsed in paired:
|
||||||
|
dup_idx = self._find_semantic_duplicate(parsed, deduped)
|
||||||
|
if dup_idx is not None:
|
||||||
|
# Keep whichever photo produced more OCR text (clearer shot)
|
||||||
|
existing_receipt, _ = deduped[dup_idx]
|
||||||
|
if len(receipt.get('text', '')) > len(existing_receipt.get('text', '')):
|
||||||
|
deduped[dup_idx] = (receipt, parsed)
|
||||||
|
actions.append(
|
||||||
|
f"Skipped duplicate photo of "
|
||||||
|
f"{parsed.get('vendor', receipt.get('filename', 'receipt'))}"
|
||||||
|
f" ${float(parsed.get('amount', 0)):.2f}"
|
||||||
|
)
|
||||||
|
logger.info('expenses_agent: semantic duplicate %s skipped',
|
||||||
|
receipt.get('filename'))
|
||||||
|
else:
|
||||||
|
deduped.append((receipt, parsed))
|
||||||
|
|
||||||
|
for receipt, parsed in deduped:
|
||||||
|
|
||||||
# Pick product by name match returned from LLM, fall back to default
|
# Pick product by name match returned from LLM, fall back to default
|
||||||
product_id = default_product_id
|
product_id = default_product_id
|
||||||
@@ -193,6 +219,43 @@ class ExpensesAgent(BaseAgent):
|
|||||||
self._actions_taken = actions
|
self._actions_taken = actions
|
||||||
return actions
|
return actions
|
||||||
|
|
||||||
|
@staticmethod
def _find_semantic_duplicate(parsed: dict, candidates: list, *,
                             amount_tolerance: float = 0.05,
                             min_vendor_ratio: float = 0.6) -> int | None:
    """
    Return the index in `candidates` of the first entry that appears to be
    the same physical receipt as `parsed`, or None if no match is found.

    `candidates` is a list of ``(receipt, parsed)`` pairs; only the second
    element of each pair is inspected.

    Match criteria (all must pass):
      1. Both amounts are non-zero and within `amount_tolerance` of each other
      2. Same date
      3. Vendor name similarity >= `min_vendor_ratio`, or at least one of
         the two vendors is a raw image filename (no vendor text to compare)

    Amounts that are missing, None, non-numeric, or not finite are treated
    as 0 and therefore never match, instead of raising.
    """
    import math

    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')

    def amount_of(record: dict) -> float:
        # Parsed values may be None, free text, or NaN/inf; all of those are
        # as ambiguous as a zero amount, so normalise them to 0.0.
        try:
            value = float(record.get('amount', 0))
        except (TypeError, ValueError):
            return 0.0
        return value if math.isfinite(value) else 0.0

    def vendor_of(record: dict) -> str:
        return str(record.get('vendor', '')).lower().strip()

    amt = amount_of(parsed)
    # Zero-amount receipts are too ambiguous to dedup; this does not depend
    # on the candidate, so decide once before scanning.
    if amt == 0:
        return None

    date = parsed.get('date', '')
    vendor = vendor_of(parsed)
    # If OCR failed the vendor is just a filename, so there is no vendor text
    # to compare and amount + date alone decide the match.
    is_filename = vendor.endswith(image_suffixes)

    for idx, (_, other) in enumerate(candidates):
        other_amt = amount_of(other)
        if other_amt == 0:
            continue
        if abs(amt - other_amt) > amount_tolerance:
            continue
        if date != other.get('date', ''):
            continue

        other_vendor = vendor_of(other)
        if is_filename or other_vendor.endswith(image_suffixes):
            # Same amount + date, no vendor text to compare: treat as dup
            return idx

        ratio = difflib.SequenceMatcher(None, vendor, other_vendor).ratio()
        if ratio >= min_vendor_ratio:
            return idx

    return None
|
||||||
|
|
||||||
async def _parse_receipt_text(self, text: str, filename: str,
|
async def _parse_receipt_text(self, text: str, filename: str,
|
||||||
expense_products: list = None,
|
expense_products: list = None,
|
||||||
date_hint: str = None) -> dict:
|
date_hint: str = None) -> dict:
|
||||||
|
|||||||
Reference in New Issue
Block a user