From 6287b3bcef5e9ba164c28b6d5d74493994a0719d Mon Sep 17 00:00:00 2001 From: Carlos Garcia Date: Thu, 21 May 2026 00:11:03 -0400 Subject: [PATCH] fix(expenses): improve receipt amount extraction and vendor naming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove card brands (VISA/Mastercard/Amex/Discover) and the approved/auth(orized) keywords from _SKIP_LINE_RE so card-terminal lines like "VISA USD$ 36.78" are no longer skipped; only change/cash/tip/gratuity lines are still excluded - Replace bottom-50% scan with full-text max scan (Pass 2): scans every line in the receipt and returns the largest dollar amount, correctly handling display-style receipts that show the charge at the top with no label (e.g. LAYAL CAFE $40.10 before the item list) - Update vendor LLM prompt to ask the model to correct OCR garbling (e.g. "NeDonald's" → "McDonald's") and detect bank statements - Add 4 new tests covering top-amount, card-terminal, max-beats-items, and change-exclusion scenarios (71 tests, all passing) Co-Authored-By: Claude Sonnet 4.6 --- agent_service/agents/expenses_agent.py | 45 ++++++++++++++++---------- tests/test_expenses_agent.py | 26 +++++++++++++-- 2 files changed, 52 insertions(+), 19 deletions(-) diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py index 2fd3c53..fcb5d37 100644 --- a/agent_service/agents/expenses_agent.py +++ b/agent_service/agents/expenses_agent.py @@ -21,11 +21,11 @@ _TOTAL_RE = re.compile( re.IGNORECASE, ) -# Lines printed AFTER the total (change given, tip, etc.) — skip these -# when doing the bottom-of-receipt scan so we don't mistake them for the total. +# Lines that should never be treated as the total — change given back, +# tip added after the fact, etc. Card-brand lines like "VISA USD$ 36.78" +# are intentionally NOT listed here: the amount on those lines IS the charge. 
_SKIP_LINE_RE = re.compile( - r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity|approved|' - r'auth(?:orized)?|visa|mastercard|amex|discover)\b', + r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity)\b', re.IGNORECASE, ) @@ -41,10 +41,15 @@ def _extract_amount_from_text(text: str) -> float: """Return the final total from OCR receipt text, or 0.0 if not found. Pass 1 — labeled total: 'Total:', 'Grand Total:', 'Amount Due:', etc. - Pass 2 — bottom scan: reads lines from the bottom of the last 50% of text, - skipping change/cash/tip lines. Handles cases where Tesseract - garbled 'TOTAL' (e.g. 'T0TAL') or placed the amount on its own - line below the label. + Pass 2 — full-text maximum: scan every line for a dollar amount (skipping + change/tip lines) and return the largest value found. This handles: + • display-style receipts that show the charge at the top with no + label (e.g. LAYAL CAFE — "$40.10" printed before the item list) + • card-terminal printouts with lines like "VISA USD$ 36.78" that + carry no 'Total' keyword + The maximum heuristic works because the receipt total is always + ≥ any individual item price; Pass 1 (labeled total) catches the + rare cases where a discount makes the total less than a line item. 
""" if not text: return 0.0 @@ -60,20 +65,21 @@ def _extract_amount_from_text(text: str) -> float: except ValueError: pass - # Pass 2: bottom-of-receipt line scan - # Only search the bottom half so item prices (middle section) are excluded - bottom = text[max(0, int(len(text) * 0.5)):] - for line in reversed(bottom.splitlines()): + # Pass 2: maximum dollar amount across the full text + best = 0.0 + for line in text.splitlines(): if _SKIP_LINE_RE.search(line): continue m = _ANY_DOLLAR_RE.search(line) if m: try: val = float(m.group(1).replace(',', '')) - if val > 0: - return val + if val > best: + best = val except ValueError: pass + if best > 0: + return best return 0.0 @@ -462,13 +468,18 @@ class ExpensesAgent(BaseAgent): excerpt = stripped[:600] prompt = ( 'Return ONLY valid JSON with exactly two keys:\n' - '"vendor": the store or restaurant name, copied exactly from the ' - 'first 1-3 lines of the receipt. Use "" if no clear name.\n' + '"vendor": the merchant or store name from the receipt header. ' + 'OCR often garbles text — use your knowledge to correct obvious ' + 'errors (e.g. "NeDonald\'s" → "McDonald\'s", "TN-N-QUT" → ' + '"IN-N-OUT Burger", "Subwey" → "Subway", "LRYAL" → "LAYAL"). ' + 'If this looks like a bank or credit-card statement listing ' + 'multiple transactions rather than a single merchant receipt, ' + 'use "". Use "" if no clear business name is visible.\n' f'"product_name": the single best match from [{product_list}] ' 'based on the type of business (restaurant→Meals, gas station→Fuel, ' 'hotel→Hotel, airline/transit→Transport, office store→Office Supplies). 
' 'Use "" if none fit.\n\n' - f'Receipt:\n{excerpt}\n\nJSON only:' + f'Receipt text:\n{excerpt}\n\nJSON only:' ) elif product_list: # OCR failed — guess category from filename only diff --git a/tests/test_expenses_agent.py b/tests/test_expenses_agent.py index b353d7d..b3af86e 100644 --- a/tests/test_expenses_agent.py +++ b/tests/test_expenses_agent.py @@ -474,11 +474,33 @@ class TestExtractAmount: assert _extract_amount_from_text(text) == 5.50 def test_amount_due_with_usd_suffix(self): - # PDF text may include "USD" after the number — regex should still work - # via the bottom scan since the labeled-total regex won't match "USD" + # "Total Charged" is in _TOTAL_RE — Pass 1 catches it text = 'Total Charged: $198.40 USD' assert _extract_amount_from_text(text) == 198.40 + def test_top_amount_returned_by_max(self): + # Display-style receipt: charge shown at top, no 'Total' label. + # Pass 2 (max) must find $40.10 even though it is before the item list. + text = 'LAYAL CAFE\n$40.10\n--------\nBreakfast 37.30\nCoffee 2.80' + assert _extract_amount_from_text(text) == 40.10 + + def test_card_terminal_visa_line(self): + # Card terminal: amount on a line prefixed with card-brand text. + # VISA must NOT be in the skip list so the amount is captured. + text = 'MERCHANT XYZ\nYHOOMHXAKKKEO4S VISA USD$ 36.78\nAuth 123456' + assert _extract_amount_from_text(text) == 36.78 + + def test_max_beats_item_prices(self): + # Receipt with several item prices — max should return the largest + # (the total), not an item that appears last in the text. + text = 'Burger 12.99\nFries 4.50\nDrink 2.99\nT0TAL 20.48' + assert _extract_amount_from_text(text) == 20.48 + + def test_change_line_excluded_from_max(self): + # Change-due line must be skipped so it never inflates the max. + text = 'Items 8.49\nCash Tendered 20.00\nChange 11.51' + assert _extract_amount_from_text(text) == 8.49 + class TestExtractDate: def test_iso_format(self):