fix(expenses): improve receipt amount extraction and vendor naming

- Remove card brands (VISA/Mastercard/Amex/Discover) and the approved/auth(orized)
  keywords from _SKIP_LINE_RE so card-terminal lines like "VISA USD$ 36.78" are
  no longer skipped.
- Replace the bottom-50% scan with a full-text max scan (Pass 2): it scans every
  line in the receipt and returns the largest dollar amount, correctly handling
  display-style receipts that show the charge at the top with no label
  (e.g. LAYAL CAFE, where "$40.10" is printed before the item list).
- Update the vendor LLM prompt to ask the model to correct OCR garbling
  (e.g. "NeDonald's" → "McDonald's") and to detect bank statements.
- Add 4 new tests covering the top-amount, card-terminal, max-beats-items, and
  change-exclusion scenarios (71 tests, all passing).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 00:11:03 -04:00
parent 1536d83376
commit 6287b3bcef
2 changed files with 52 additions and 19 deletions
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -21,11 +21,11 @@ _TOTAL_RE = re.compile(
    re.IGNORECASE,
 )
-# Lines printed AFTER the total (change given, tip, etc.) — skip these
+# Lines that should never be treated as the total — change given back,
-# when doing the bottom-of-receipt scan so we don't mistake them for the total.
+# tip added after the fact, etc.  Card-brand lines like "VISA USD$ 36.78"
 # are intentionally NOT listed here: the amount on those lines IS the charge.
 _SKIP_LINE_RE = re.compile(
-    r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity|approved|'
+    r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity)\b',
    r'auth(?:orized)?|visa|mastercard|amex|discover)\b',
    re.IGNORECASE,
 )
@@ -41,10 +41,15 @@ def _extract_amount_from_text(text: str) -> float:
    """Return the final total from OCR receipt text, or 0.0 if not found.
    Pass 1 — labeled total: 'Total:', 'Grand Total:', 'Amount Due:', etc.
-    Pass 2 — bottom scan: reads lines from the bottom of the last 50% of text,
+    Pass 2 — full-text maximum: scan every line for a dollar amount (skipping
-             skipping change/cash/tip lines.  Handles cases where Tesseract
+             change/tip lines) and return the largest value found.  This handles:
-             garbled 'TOTAL' (e.g. 'T0TAL') or placed the amount on its own
+             • display-style receipts that show the charge at the top with no
-             line below the label.
+               label (e.g. LAYAL CAFE — "$40.10" printed before the item list)
             • card-terminal printouts with lines like "VISA USD$ 36.78" that
               carry no 'Total' keyword
             The maximum heuristic works because the receipt total is always
             ≥ any individual item price; Pass 1 (labeled total) catches the
             rare cases where a discount makes the total less than a line item.
    """
    if not text:
        return 0.0
@@ -60,20 +65,21 @@ def _extract_amount_from_text(text: str) -> float:
        except ValueError:
            pass
-    # Pass 2: bottom-of-receipt line scan
+    # Pass 2: maximum dollar amount across the full text
-    # Only search the bottom half so item prices (middle section) are excluded
+    best = 0.0
-    bottom = text[max(0, int(len(text) * 0.5)):]
+    for line in text.splitlines():
    for line in reversed(bottom.splitlines()):
        if _SKIP_LINE_RE.search(line):
            continue
        m = _ANY_DOLLAR_RE.search(line)
        if m:
            try:
                val = float(m.group(1).replace(',', ''))
-                if val > 0:
+                if val > best:
-                    return val
+                    best = val
            except ValueError:
                pass
    if best > 0:
        return best
    return 0.0
@@ -462,13 +468,18 @@ class ExpensesAgent(BaseAgent):
            excerpt = stripped[:600]
            prompt = (
                'Return ONLY valid JSON with exactly two keys:\n'
-                '"vendor": the store or restaurant name, copied exactly from the '
+                '"vendor": the merchant or store name from the receipt header. '
-                'first 1-3 lines of the receipt. Use "" if no clear name.\n'
+                'OCR often garbles text — use your knowledge to correct obvious '
                'errors (e.g. "NeDonald\'s" → "McDonald\'s", "TN-N-QUT" → '
                '"IN-N-OUT Burger", "Subwey" → "Subway", "LRYAL" → "LAYAL"). '
                'If this looks like a bank or credit-card statement listing '
                'multiple transactions rather than a single merchant receipt, '
                'use "". Use "" if no clear business name is visible.\n'
                f'"product_name": the single best match from [{product_list}] '
                'based on the type of business (restaurant→Meals, gas station→Fuel, '
                'hotel→Hotel, airline/transit→Transport, office store→Office Supplies). '
                'Use "" if none fit.\n\n'
-                f'Receipt:\n{excerpt}\n\nJSON only:'
+                f'Receipt text:\n{excerpt}\n\nJSON only:'
            )
        elif product_list:
            # OCR failed — guess category from filename only
--- a/tests/test_expenses_agent.py
+++ b/tests/test_expenses_agent.py
@@ -474,11 +474,33 @@ class TestExtractAmount:
        assert _extract_amount_from_text(text) == 5.50
    def test_amount_due_with_usd_suffix(self):
-        # PDF text may include "USD" after the number — regex should still work
+        # "Total Charged" is in _TOTAL_RE — Pass 1 catches it
        # via the bottom scan since the labeled-total regex won't match "USD"
        text = 'Total Charged: $198.40 USD'
        assert _extract_amount_from_text(text) == 198.40
    def test_top_amount_returned_by_max(self):
        # Display-style receipt: charge shown at top, no 'Total' label.
        # Pass 2 (max) must find $40.10 even though it is before the item list.
        text = 'LAYAL CAFE\n$40.10\n--------\nBreakfast  37.30\nCoffee  2.80'
        assert _extract_amount_from_text(text) == 40.10
    def test_card_terminal_visa_line(self):
        # Card terminal: amount on a line prefixed with card-brand text.
        # VISA must NOT be in the skip list so the amount is captured.
        text = 'MERCHANT XYZ\nYHOOMHXAKKKEO4S VISA USD$ 36.78\nAuth 123456'
        assert _extract_amount_from_text(text) == 36.78
    def test_max_beats_item_prices(self):
        # Receipt with several item prices — max should return the largest
        # (the total), not an item that appears last in the text.
        text = 'Burger  12.99\nFries   4.50\nDrink   2.99\nT0TAL  20.48'
        assert _extract_amount_from_text(text) == 20.48
    def test_change_line_excluded_from_max(self):
        # Change-due line must be skipped so it never inflates the max.
        text = 'Items  8.49\nCash Tendered  20.00\nChange  11.51'
        assert _extract_amount_from_text(text) == 8.49
 class TestExtractDate:
    def test_iso_format(self):