feat: two-pass dedup catches same-vendor OCR amount misreads

Pass 1 (unchanged): same date + amount within $0.05 + vendor similarity >= 60%.
Pass 2 (new): same date + vendor similarity >= 80%, with both amounts
non-zero but otherwise free to differ, to catch receipts where OCR misread
the total. Both passes skip pairs whose times are more than 30 min apart.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-16 16:48:51 -04:00
parent 774c0cc062
commit 12576ead1b

View File

@@ -254,12 +254,17 @@ class ExpensesAgent(BaseAgent):
Return the index in `candidates` of a receipt that appears to be the
same physical receipt as `parsed`, or None if no match found.
Match criteria (all must pass):
Pass 1 — exact-amount match (all must pass):
1. Same date
2. Amount > 0 and within $0.05 of each other
3. Transaction times within 30 min of each other (if both present);
times > 30 min apart rule out a duplicate
4. Vendor name similarity >= 60 % (or both vendors are raw filenames)
3. Transaction times within 30 min (if both present)
4. Vendor similarity >= 60 % (or both vendors are raw filenames)
Pass 2 — OCR-error match (amount may differ due to misread):
1. Same date
2. Both amounts > 0
3. Vendor similarity >= 80 % (stricter threshold compensates for loose amount)
4. Times within 30 min (if both present)
"""
amt = float(parsed.get('amount', 0))
date = parsed.get('date', '')
@@ -267,35 +272,52 @@ class ExpensesAgent(BaseAgent):
vendor = str(parsed.get('vendor', '')).lower().strip()
is_filename = vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
def _times_compatible(t1, t2) -> bool:
    """Tell whether two transaction times could belong to the same receipt.

    Both arguments are expected to be ``HH:MM`` (optionally ``HH:MM:SS``)
    strings; only the first two colon-separated fields are used.

    Returns:
        False only when both times are present, both parse cleanly, and
        they lie more than 30 minutes apart. A missing or unparseable time
        carries no signal, so the pair is treated as compatible (True).
    """
    if not t1 or not t2:
        return True

    def _minutes_since_midnight(stamp):
        # Unpacking into exactly two names also rejects stamps without a
        # minutes field (e.g. "12"), which raises and is handled below.
        hours, minutes = map(int, stamp.split(':')[:2])
        return hours * 60 + minutes

    try:
        gap = abs(_minutes_since_midnight(t1) - _minutes_since_midnight(t2))
    except Exception:
        # Best-effort: an unparseable time must not block dedup matching.
        return True
    return gap <= 30
# Pass 1: amount must match within $0.05
for idx, (_, other) in enumerate(candidates):
other_amt = float(other.get('amount', 0))
# Skip zero-amount receipts — too ambiguous to dedup
if amt == 0 or other_amt == 0:
continue
if abs(amt - other_amt) > 0.05:
continue
if date != other.get('date', ''):
continue
# Time check: if both receipts have a transaction time and they are
# more than 30 minutes apart they are different transactions.
other_time = other.get('time')
if time and other_time:
try:
h1, m1 = (int(p) for p in time.split(':')[:2])
h2, m2 = (int(p) for p in other_time.split(':')[:2])
if abs((h1 * 60 + m1) - (h2 * 60 + m2)) > 30:
continue
except Exception:
pass # unparseable time — ignore the signal
if not _times_compatible(time, other.get('time')):
continue
other_vendor = str(other.get('vendor', '')).lower().strip()
other_is_filename = other_vendor.endswith(
('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
if is_filename or other_is_filename:
# Same amount + date, no vendor text to compare — treat as dup
return idx
ratio = difflib.SequenceMatcher(None, vendor, other_vendor).ratio()
if ratio >= 0.6:
if difflib.SequenceMatcher(None, vendor, other_vendor).ratio() >= 0.6:
return idx
# Pass 2: same vendor + same date even when amounts differ (OCR misread)
if not is_filename:
for idx, (_, other) in enumerate(candidates):
other_amt = float(other.get('amount', 0))
if amt == 0 or other_amt == 0:
continue
if date != other.get('date', ''):
continue
if not _times_compatible(time, other.get('time')):
continue
other_vendor = str(other.get('vendor', '')).lower().strip()
if other_vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')):
continue
if difflib.SequenceMatcher(None, vendor, other_vendor).ratio() >= 0.80:
return idx
return None
async def _parse_receipt_text(self, text: str, filename: str,