fix(expenses): improve receipt amount extraction and vendor naming

- Remove card brands (Visa/Mastercard/Amex/Discover) and the approval
  keywords (approved, auth/authorized) from _SKIP_LINE_RE so card-terminal
  lines like "VISA USD$ 36.78" are no longer skipped
- Replace bottom-50% scan with full-text max scan (Pass 2): scans every
  line in the receipt and returns the largest dollar amount, correctly
  handling display-style receipts that show the charge at the top with
  no label (e.g. LAYAL CAFE $40.10 before the item list)
- Update vendor LLM prompt to ask the model to correct OCR garbling
  (e.g. "NeDonald's" → "McDonald's") and detect bank statements
- Add 4 new tests covering top-amount, card-terminal, max-beats-items,
  and change-exclusion scenarios (71 tests, all passing)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-21 00:11:03 -04:00
parent 1536d83376
commit 6287b3bcef
2 changed files with 52 additions and 19 deletions

View File

@@ -21,11 +21,11 @@ _TOTAL_RE = re.compile(
re.IGNORECASE, re.IGNORECASE,
) )
# Lines printed AFTER the total (change given, tip, etc.) — skip these # Lines that should never be treated as the total — change given back,
# when doing the bottom-of-receipt scan so we don't mistake them for the total. # tip added after the fact, etc. Card-brand lines like "VISA USD$ 36.78"
# are intentionally NOT listed here: the amount on those lines IS the charge.
_SKIP_LINE_RE = re.compile( _SKIP_LINE_RE = re.compile(
r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity|approved|' r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity)\b',
r'auth(?:orized)?|visa|mastercard|amex|discover)\b',
re.IGNORECASE, re.IGNORECASE,
) )
@@ -41,10 +41,15 @@ def _extract_amount_from_text(text: str) -> float:
"""Return the final total from OCR receipt text, or 0.0 if not found. """Return the final total from OCR receipt text, or 0.0 if not found.
Pass 1 — labeled total: 'Total:', 'Grand Total:', 'Amount Due:', etc. Pass 1 — labeled total: 'Total:', 'Grand Total:', 'Amount Due:', etc.
Pass 2 — bottom scan: reads lines from the bottom of the last 50% of text, Pass 2 — full-text maximum: scan every line for a dollar amount (skipping
skipping change/cash/tip lines. Handles cases where Tesseract change/tip lines) and return the largest value found. This handles:
garbled 'TOTAL' (e.g. 'T0TAL') or placed the amount on its own • display-style receipts that show the charge at the top with no
line below the label. label (e.g. LAYAL CAFE — "$40.10" printed before the item list)
• card-terminal printouts with lines like "VISA USD$ 36.78" that
carry no 'Total' keyword
The maximum heuristic works because the receipt total is always
≥ any individual item price; Pass 1 (labeled total) catches the
rare cases where a discount makes the total less than a line item.
""" """
if not text: if not text:
return 0.0 return 0.0
@@ -60,20 +65,21 @@ def _extract_amount_from_text(text: str) -> float:
except ValueError: except ValueError:
pass pass
# Pass 2: bottom-of-receipt line scan # Pass 2: maximum dollar amount across the full text
# Only search the bottom half so item prices (middle section) are excluded best = 0.0
bottom = text[max(0, int(len(text) * 0.5)):] for line in text.splitlines():
for line in reversed(bottom.splitlines()):
if _SKIP_LINE_RE.search(line): if _SKIP_LINE_RE.search(line):
continue continue
m = _ANY_DOLLAR_RE.search(line) m = _ANY_DOLLAR_RE.search(line)
if m: if m:
try: try:
val = float(m.group(1).replace(',', '')) val = float(m.group(1).replace(',', ''))
if val > 0: if val > best:
return val best = val
except ValueError: except ValueError:
pass pass
if best > 0:
return best
return 0.0 return 0.0
@@ -462,13 +468,18 @@ class ExpensesAgent(BaseAgent):
excerpt = stripped[:600] excerpt = stripped[:600]
prompt = ( prompt = (
'Return ONLY valid JSON with exactly two keys:\n' 'Return ONLY valid JSON with exactly two keys:\n'
'"vendor": the store or restaurant name, copied exactly from the ' '"vendor": the merchant or store name from the receipt header. '
'first 1-3 lines of the receipt. Use "" if no clear name.\n' 'OCR often garbles text — use your knowledge to correct obvious '
'errors (e.g. "NeDonald\'s""McDonald\'s", "TN-N-QUT"'
'"IN-N-OUT Burger", "Subwey""Subway", "LRYAL""LAYAL"). '
'If this looks like a bank or credit-card statement listing '
'multiple transactions rather than a single merchant receipt, '
'use "". Use "" if no clear business name is visible.\n'
f'"product_name": the single best match from [{product_list}] ' f'"product_name": the single best match from [{product_list}] '
'based on the type of business (restaurant→Meals, gas station→Fuel, ' 'based on the type of business (restaurant→Meals, gas station→Fuel, '
'hotel→Hotel, airline/transit→Transport, office store→Office Supplies). ' 'hotel→Hotel, airline/transit→Transport, office store→Office Supplies). '
'Use "" if none fit.\n\n' 'Use "" if none fit.\n\n'
f'Receipt:\n{excerpt}\n\nJSON only:' f'Receipt text:\n{excerpt}\n\nJSON only:'
) )
elif product_list: elif product_list:
# OCR failed — guess category from filename only # OCR failed — guess category from filename only

View File

@@ -474,11 +474,33 @@ class TestExtractAmount:
assert _extract_amount_from_text(text) == 5.50 assert _extract_amount_from_text(text) == 5.50
def test_amount_due_with_usd_suffix(self): def test_amount_due_with_usd_suffix(self):
# PDF text may include "USD" after the number — regex should still work # "Total Charged" is in _TOTAL_RE — Pass 1 catches it
# via the bottom scan since the labeled-total regex won't match "USD"
text = 'Total Charged: $198.40 USD' text = 'Total Charged: $198.40 USD'
assert _extract_amount_from_text(text) == 198.40 assert _extract_amount_from_text(text) == 198.40
def test_top_amount_returned_by_max(self):
# Display-style receipt: charge shown at top, no 'Total' label.
# Pass 2 (max) must find $40.10 even though it is before the item list.
text = 'LAYAL CAFE\n$40.10\n--------\nBreakfast 37.30\nCoffee 2.80'
assert _extract_amount_from_text(text) == 40.10
def test_card_terminal_visa_line(self):
# Card terminal: amount on a line prefixed with card-brand text.
# VISA must NOT be in the skip list so the amount is captured.
text = 'MERCHANT XYZ\nYHOOMHXAKKKEO4S VISA USD$ 36.78\nAuth 123456'
assert _extract_amount_from_text(text) == 36.78
def test_max_beats_item_prices(self):
# Receipt with several item prices — max should return the largest
# (the total), not an item that appears last in the text.
text = 'Burger 12.99\nFries 4.50\nDrink 2.99\nT0TAL 20.48'
assert _extract_amount_from_text(text) == 20.48
def test_change_line_excluded_from_max(self):
# Change-due line must be skipped so it never inflates the max.
text = 'Items 8.49\nCash Tendered 20.00\nChange 11.51'
assert _extract_amount_from_text(text) == 8.49
class TestExtractDate: class TestExtractDate:
def test_iso_format(self): def test_iso_format(self):