From 6287b3bcef5e9ba164c28b6d5d74493994a0719d Mon Sep 17 00:00:00 2001
From: Carlos Garcia <tocmo@DESKTOP-O6U0UOS.AVC.local>
Date: Thu, 21 May 2026 00:11:03 -0400
Subject: [PATCH] fix(expenses): improve receipt amount extraction and vendor
 naming
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove card brands (visa/mastercard/amex/discover) and the approved/
  auth(orized) keywords from _SKIP_LINE_RE so card-terminal lines like
  "VISA USD$ 36.78" are no longer skipped
- Replace bottom-50% scan with full-text max scan (Pass 2): scans every
  line in the receipt and returns the largest dollar amount, correctly
  handling display-style receipts that show the charge at the top with
  no label (e.g. LAYAL CAFE $40.10 before the item list)
- Update vendor LLM prompt to ask the model to correct OCR garbling
  (e.g. "NeDonald's" → "McDonald's") and detect bank statements
- Add 4 new tests covering top-amount, card-terminal, max-beats-items,
  and change-exclusion scenarios (71 tests, all passing)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 agent_service/agents/expenses_agent.py | 45 ++++++++++++++++----------
 tests/test_expenses_agent.py           | 26 +++++++++++++--
 2 files changed, 52 insertions(+), 19 deletions(-)

diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py
index 2fd3c53..fcb5d37 100644
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -21,11 +21,11 @@ _TOTAL_RE = re.compile(
     re.IGNORECASE,
 )
 
-# Lines printed AFTER the total (change given, tip, etc.) — skip these
-# when doing the bottom-of-receipt scan so we don't mistake them for the total.
+# Lines that should never be treated as the total — change given back, cash
+# paid/tendered, tip or gratuity.  Card-brand lines like "VISA USD$ 36.78"
+# are intentionally NOT listed here: the amount on those lines IS the charge.
 _SKIP_LINE_RE = re.compile(
-    r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity|approved|'
-    r'auth(?:orized)?|visa|mastercard|amex|discover)\b',
+    r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity)\b',
     re.IGNORECASE,
 )
 
@@ -41,10 +41,15 @@ def _extract_amount_from_text(text: str) -> float:
     """Return the final total from OCR receipt text, or 0.0 if not found.
 
     Pass 1 — labeled total: 'Total:', 'Grand Total:', 'Amount Due:', etc.
-    Pass 2 — bottom scan: reads lines from the bottom of the last 50% of text,
-             skipping change/cash/tip lines.  Handles cases where Tesseract
-             garbled 'TOTAL' (e.g. 'T0TAL') or placed the amount on its own
-             line below the label.
+    Pass 2 — full-text maximum: scan every line for a dollar amount (skipping
+             change/tip lines) and return the largest value found.  This handles:
+             • display-style receipts that show the charge at the top with no
+               label (e.g. LAYAL CAFE — "$40.10" printed before the item list)
+             • card-terminal printouts with lines like "VISA USD$ 36.78" that
+               carry no 'Total' keyword
+             The maximum heuristic assumes the receipt total is ≥ every
+             individual item price; when a discount makes the total smaller
+             than a line item, only a labeled total (Pass 1) is reliable.
     """
     if not text:
         return 0.0
@@ -60,20 +65,21 @@ def _extract_amount_from_text(text: str) -> float:
         except ValueError:
             pass
 
-    # Pass 2: bottom-of-receipt line scan
-    # Only search the bottom half so item prices (middle section) are excluded
-    bottom = text[max(0, int(len(text) * 0.5)):]
-    for line in reversed(bottom.splitlines()):
+    # Pass 2: maximum dollar amount across the full text
+    best = 0.0
+    for line in text.splitlines():
         if _SKIP_LINE_RE.search(line):
             continue
         m = _ANY_DOLLAR_RE.search(line)
         if m:
             try:
                 val = float(m.group(1).replace(',', ''))
-                if val > 0:
-                    return val
+                if val > best:
+                    best = val
             except ValueError:
                 pass
+    if best > 0:
+        return best
 
     return 0.0
 
@@ -462,13 +468,18 @@ class ExpensesAgent(BaseAgent):
             excerpt = stripped[:600]
             prompt = (
                 'Return ONLY valid JSON with exactly two keys:\n'
-                '"vendor": the store or restaurant name, copied exactly from the '
-                'first 1-3 lines of the receipt. Use "" if no clear name.\n'
+                '"vendor": the merchant or store name from the receipt header. '
+                'OCR often garbles text — use your knowledge to correct obvious '
+                'errors (e.g. "NeDonald\'s" → "McDonald\'s", "TN-N-QUT" → '
+                '"IN-N-OUT Burger", "Subwey" → "Subway", "LRYAL" → "LAYAL"). '
+                'If this looks like a bank or credit-card statement listing '
+                'multiple transactions rather than a single merchant receipt, '
+                'use "". Use "" if no clear business name is visible.\n'
                 f'"product_name": the single best match from [{product_list}] '
                 'based on the type of business (restaurant→Meals, gas station→Fuel, '
                 'hotel→Hotel, airline/transit→Transport, office store→Office Supplies). '
                 'Use "" if none fit.\n\n'
-                f'Receipt:\n{excerpt}\n\nJSON only:'
+                f'Receipt text:\n{excerpt}\n\nJSON only:'
             )
         elif product_list:
             # OCR failed — guess category from filename only
diff --git a/tests/test_expenses_agent.py b/tests/test_expenses_agent.py
index b353d7d..b3af86e 100644
--- a/tests/test_expenses_agent.py
+++ b/tests/test_expenses_agent.py
@@ -474,11 +474,33 @@ class TestExtractAmount:
         assert _extract_amount_from_text(text) == 5.50
 
     def test_amount_due_with_usd_suffix(self):
-        # PDF text may include "USD" after the number — regex should still work
-        # via the bottom scan since the labeled-total regex won't match "USD"
+        # "Total Charged" is in _TOTAL_RE — Pass 1 catches it
         text = 'Total Charged: $198.40 USD'
         assert _extract_amount_from_text(text) == 198.40
 
+    def test_top_amount_returned_by_max(self):
+        # Display-style receipt: charge shown at top, no 'Total' label.
+        # Pass 2 (max) must find $40.10 even though it is before the item list.
+        text = 'LAYAL CAFE\n$40.10\n--------\nBreakfast  37.30\nCoffee  2.80'
+        assert _extract_amount_from_text(text) == 40.10
+
+    def test_card_terminal_visa_line(self):
+        # Card terminal: amount on a line prefixed with card-brand text.
+        # VISA must NOT be in the skip list so the amount is captured.
+        text = 'MERCHANT XYZ\nYHOOMHXAKKKEO4S VISA USD$ 36.78\nAuth 123456'
+        assert _extract_amount_from_text(text) == 36.78
+
+    def test_max_beats_item_prices(self):
+        # Receipt with several item prices — max should return the largest
+        # (the total), not an item that appears last in the text.
+        text = 'Burger  12.99\nFries   4.50\nDrink   2.99\nT0TAL  20.48'
+        assert _extract_amount_from_text(text) == 20.48
+
+    def test_change_line_excluded_from_max(self):
+        # Change-due line must be skipped so it never inflates the max.
+        text = 'Items  8.49\nCash Tendered  20.00\nChange  11.51'
+        assert _extract_amount_from_text(text) == 8.49
+
 
 class TestExtractDate:
     def test_iso_format(self):