From 12576ead1b8bac03dfd8b6d78bb0876e937a3dd5 Mon Sep 17 00:00:00 2001 From: Carlos Garcia Date: Sat, 16 May 2026 16:48:51 -0400 Subject: [PATCH] feat: two-pass dedup catches same-vendor OCR amount misreads Pass 1 unchanged: same date + amount within 0.05 + vendor similarity 60%. Pass 2 (new): same vendor (>= 80% similarity) + same date, regardless of amount, to catch receipts where OCR misread the total. Co-Authored-By: Claude Sonnet 4.6 --- agent_service/agents/expenses_agent.py | 60 ++++++++++++++++++-------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py index 09fb2d0..53756d6 100644 --- a/agent_service/agents/expenses_agent.py +++ b/agent_service/agents/expenses_agent.py @@ -254,12 +254,17 @@ class ExpensesAgent(BaseAgent): Return the index in `candidates` of a receipt that appears to be the same physical receipt as `parsed`, or None if no match found. - Match criteria (all must pass): + Pass 1 — exact-amount match (all must pass): 1. Same date 2. Amount > 0 and within $0.05 of each other - 3. Transaction times within 30 min of each other (if both present); - times > 30 min apart rule out a duplicate - 4. Vendor name similarity >= 60 % (or both vendors are raw filenames) + 3. Transaction times within 30 min (if both present) + 4. Vendor similarity >= 60 % (or both vendors are raw filenames) + + Pass 2 — OCR-error match (amount may differ due to misread): + 1. Same date + 2. Both amounts > 0 + 3. Vendor similarity >= 80 % (stricter threshold compensates for loose amount) + 4. Times within 30 min (if both present) """ amt = float(parsed.get('amount', 0)) date = parsed.get('date', '') @@ -267,35 +272,52 @@ class ExpensesAgent(BaseAgent): vendor = str(parsed.get('vendor', '')).lower().strip() is_filename = vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')) + def _times_compatible(t1, t2) -> bool: + """Return False only when both times are present and >30 min apart.""" + if not (t1 and t2): + return True + try: + h1, m1 = (int(p) for p in t1.split(':')[:2]) + h2, m2 = (int(p) for p in t2.split(':')[:2]) + return abs((h1 * 60 + m1) - (h2 * 60 + m2)) <= 30 + except Exception: + return True + + # Pass 1: amount must match within $0.05 for idx, (_, other) in enumerate(candidates): other_amt = float(other.get('amount', 0)) - # Skip zero-amount receipts — too ambiguous to dedup if amt == 0 or other_amt == 0: continue if abs(amt - other_amt) > 0.05: continue if date != other.get('date', ''): continue - # Time check: if both receipts have a transaction time and they are - # more than 30 minutes apart they are different transactions. - other_time = other.get('time') - if time and other_time: - try: - h1, m1 = (int(p) for p in time.split(':')[:2]) - h2, m2 = (int(p) for p in other_time.split(':')[:2]) - if abs((h1 * 60 + m1) - (h2 * 60 + m2)) > 30: - continue - except Exception: - pass # unparseable time — ignore the signal + if not _times_compatible(time, other.get('time')): + continue other_vendor = str(other.get('vendor', '')).lower().strip() other_is_filename = other_vendor.endswith( ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')) if is_filename or other_is_filename: - # Same amount + date, no vendor text to compare — treat as dup return idx - ratio = difflib.SequenceMatcher(None, vendor, other_vendor).ratio() - if ratio >= 0.6: + if difflib.SequenceMatcher(None, vendor, other_vendor).ratio() >= 0.6: return idx + + # Pass 2: same vendor + same date even when amounts differ (OCR misread) + if not is_filename: + for idx, (_, other) in enumerate(candidates): + other_amt = float(other.get('amount', 0)) + if amt == 0 or other_amt == 0: + continue + if date != other.get('date', ''): + continue + if not _times_compatible(time, other.get('time')): + continue + other_vendor = str(other.get('vendor', '')).lower().strip() + if other_vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')): + continue + if difflib.SequenceMatcher(None, vendor, other_vendor).ratio() >= 0.80: + return idx + return None async def _parse_receipt_text(self, text: str, filename: str,