feat: two-pass dedup catches same-vendor OCR amount misreads
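Pass 1 (unchanged): same date + amounts within $0.05 of each other + vendor similarity >= 60%. Pass 2 (new): same date + vendor similarity >= 80%, regardless of how far the amounts differ (both must still be non-zero, and transaction times must be within 30 min when both are present), to catch receipts where OCR misread the total. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>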
This commit is contained in:
@@ -254,12 +254,17 @@ class ExpensesAgent(BaseAgent):
|
||||
Return the index in `candidates` of a receipt that appears to be the
|
||||
same physical receipt as `parsed`, or None if no match found.
|
||||
|
||||
Match criteria (all must pass):
|
||||
Pass 1 — exact-amount match (all must pass):
|
||||
1. Same date
|
||||
2. Amount > 0 and within $0.05 of each other
|
||||
3. Transaction times within 30 min of each other (if both present);
|
||||
times > 30 min apart rule out a duplicate
|
||||
4. Vendor name similarity >= 60 % (or both vendors are raw filenames)
|
||||
3. Transaction times within 30 min (if both present)
|
||||
4. Vendor similarity >= 60 % (or both vendors are raw filenames)
|
||||
|
||||
Pass 2 — OCR-error match (amount may differ due to misread):
|
||||
1. Same date
|
||||
2. Both amounts > 0
|
||||
3. Vendor similarity >= 80 % (stricter threshold compensates for loose amount)
|
||||
4. Times within 30 min (if both present)
|
||||
"""
|
||||
amt = float(parsed.get('amount', 0))
|
||||
date = parsed.get('date', '')
|
||||
@@ -267,35 +272,52 @@ class ExpensesAgent(BaseAgent):
|
||||
vendor = str(parsed.get('vendor', '')).lower().strip()
|
||||
is_filename = vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
|
||||
|
||||
def _times_compatible(t1, t2) -> bool:
|
||||
"""Return False only when both times are present and >30 min apart."""
|
||||
if not (t1 and t2):
|
||||
return True
|
||||
try:
|
||||
h1, m1 = (int(p) for p in t1.split(':')[:2])
|
||||
h2, m2 = (int(p) for p in t2.split(':')[:2])
|
||||
return abs((h1 * 60 + m1) - (h2 * 60 + m2)) <= 30
|
||||
except Exception:
|
||||
return True
|
||||
|
||||
# Pass 1: amount must match within $0.05
|
||||
for idx, (_, other) in enumerate(candidates):
|
||||
other_amt = float(other.get('amount', 0))
|
||||
# Skip zero-amount receipts — too ambiguous to dedup
|
||||
if amt == 0 or other_amt == 0:
|
||||
continue
|
||||
if abs(amt - other_amt) > 0.05:
|
||||
continue
|
||||
if date != other.get('date', ''):
|
||||
continue
|
||||
# Time check: if both receipts have a transaction time and they are
|
||||
# more than 30 minutes apart they are different transactions.
|
||||
other_time = other.get('time')
|
||||
if time and other_time:
|
||||
try:
|
||||
h1, m1 = (int(p) for p in time.split(':')[:2])
|
||||
h2, m2 = (int(p) for p in other_time.split(':')[:2])
|
||||
if abs((h1 * 60 + m1) - (h2 * 60 + m2)) > 30:
|
||||
continue
|
||||
except Exception:
|
||||
pass # unparseable time — ignore the signal
|
||||
if not _times_compatible(time, other.get('time')):
|
||||
continue
|
||||
other_vendor = str(other.get('vendor', '')).lower().strip()
|
||||
other_is_filename = other_vendor.endswith(
|
||||
('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
|
||||
if is_filename or other_is_filename:
|
||||
# Same amount + date, no vendor text to compare — treat as dup
|
||||
return idx
|
||||
ratio = difflib.SequenceMatcher(None, vendor, other_vendor).ratio()
|
||||
if ratio >= 0.6:
|
||||
if difflib.SequenceMatcher(None, vendor, other_vendor).ratio() >= 0.6:
|
||||
return idx
|
||||
|
||||
# Pass 2: same vendor + same date even when amounts differ (OCR misread)
|
||||
if not is_filename:
|
||||
for idx, (_, other) in enumerate(candidates):
|
||||
other_amt = float(other.get('amount', 0))
|
||||
if amt == 0 or other_amt == 0:
|
||||
continue
|
||||
if date != other.get('date', ''):
|
||||
continue
|
||||
if not _times_compatible(time, other.get('time')):
|
||||
continue
|
||||
other_vendor = str(other.get('vendor', '')).lower().strip()
|
||||
if other_vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')):
|
||||
continue
|
||||
if difflib.SequenceMatcher(None, vendor, other_vendor).ratio() >= 0.80:
|
||||
return idx
|
||||
|
||||
return None
|
||||
|
||||
async def _parse_receipt_text(self, text: str, filename: str,
|
||||
|
||||
Reference in New Issue
Block a user