fix(expenses): improve receipt amount extraction and vendor naming
- Remove card brands (VISA/Mastercard/Amex/Discover) and the approved/auth keywords from _SKIP_LINE_RE so card-terminal lines like "VISA USD$ 36.78" are no longer skipped
- Replace bottom-50% scan with full-text max scan (Pass 2): scans every line in the receipt and returns the largest dollar amount, correctly handling display-style receipts that show the charge at the top with no label (e.g. LAYAL CAFE $40.10 before the item list)
- Update vendor LLM prompt to ask the model to correct OCR garbling (e.g. "NeDonald's" → "McDonald's") and detect bank statements
- Add 4 new tests covering top-amount, card-terminal, max-beats-items, and change-exclusion scenarios (71 tests, all passing)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -21,11 +21,11 @@ _TOTAL_RE = re.compile(
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# Lines printed AFTER the total (change given, tip, etc.) — skip these
|
||||
# when doing the bottom-of-receipt scan so we don't mistake them for the total.
|
||||
# Lines that should never be treated as the total — change given back,
|
||||
# tip added after the fact, etc. Card-brand lines like "VISA USD$ 36.78"
|
||||
# are intentionally NOT listed here: the amount on those lines IS the charge.
|
||||
_SKIP_LINE_RE = re.compile(
|
||||
r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity|approved|'
|
||||
r'auth(?:orized)?|visa|mastercard|amex|discover)\b',
|
||||
r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity)\b',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
@@ -41,10 +41,15 @@ def _extract_amount_from_text(text: str) -> float:
|
||||
"""Return the final total from OCR receipt text, or 0.0 if not found.
|
||||
|
||||
Pass 1 — labeled total: 'Total:', 'Grand Total:', 'Amount Due:', etc.
|
||||
Pass 2 — bottom scan: reads lines from the bottom of the last 50% of text,
|
||||
skipping change/cash/tip lines. Handles cases where Tesseract
|
||||
garbled 'TOTAL' (e.g. 'T0TAL') or placed the amount on its own
|
||||
line below the label.
|
||||
Pass 2 — full-text maximum: scan every line for a dollar amount (skipping
|
||||
change/tip lines) and return the largest value found. This handles:
|
||||
• display-style receipts that show the charge at the top with no
|
||||
label (e.g. LAYAL CAFE — "$40.10" printed before the item list)
|
||||
• card-terminal printouts with lines like "VISA USD$ 36.78" that
|
||||
carry no 'Total' keyword
|
||||
The maximum heuristic works because the receipt total is normally
|
||||
≥ any individual item price; Pass 1 (labeled total) catches the
|
||||
rare cases where a discount makes the total less than a line item.
|
||||
"""
|
||||
if not text:
|
||||
return 0.0
|
||||
@@ -60,20 +65,21 @@ def _extract_amount_from_text(text: str) -> float:
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Pass 2: bottom-of-receipt line scan
|
||||
# Only search the bottom half so item prices (middle section) are excluded
|
||||
bottom = text[max(0, int(len(text) * 0.5)):]
|
||||
for line in reversed(bottom.splitlines()):
|
||||
# Pass 2: maximum dollar amount across the full text
|
||||
best = 0.0
|
||||
for line in text.splitlines():
|
||||
if _SKIP_LINE_RE.search(line):
|
||||
continue
|
||||
m = _ANY_DOLLAR_RE.search(line)
|
||||
if m:
|
||||
try:
|
||||
val = float(m.group(1).replace(',', ''))
|
||||
if val > 0:
|
||||
return val
|
||||
if val > best:
|
||||
best = val
|
||||
except ValueError:
|
||||
pass
|
||||
if best > 0:
|
||||
return best
|
||||
|
||||
return 0.0
|
||||
|
||||
@@ -462,13 +468,18 @@ class ExpensesAgent(BaseAgent):
|
||||
excerpt = stripped[:600]
|
||||
prompt = (
|
||||
'Return ONLY valid JSON with exactly two keys:\n'
|
||||
'"vendor": the store or restaurant name, copied exactly from the '
|
||||
'first 1-3 lines of the receipt. Use "" if no clear name.\n'
|
||||
'"vendor": the merchant or store name from the receipt header. '
|
||||
'OCR often garbles text — use your knowledge to correct obvious '
|
||||
'errors (e.g. "NeDonald\'s" → "McDonald\'s", "TN-N-QUT" → '
|
||||
'"IN-N-OUT Burger", "Subwey" → "Subway", "LRYAL" → "LAYAL"). '
|
||||
'If this looks like a bank or credit-card statement listing '
|
||||
'multiple transactions rather than a single merchant receipt, '
|
||||
'use "". Use "" if no clear business name is visible.\n'
|
||||
f'"product_name": the single best match from [{product_list}] '
|
||||
'based on the type of business (restaurant→Meals, gas station→Fuel, '
|
||||
'hotel→Hotel, airline/transit→Transport, office store→Office Supplies). '
|
||||
'Use "" if none fit.\n\n'
|
||||
f'Receipt:\n{excerpt}\n\nJSON only:'
|
||||
f'Receipt text:\n{excerpt}\n\nJSON only:'
|
||||
)
|
||||
elif product_list:
|
||||
# OCR failed — guess category from filename only
|
||||
|
||||
@@ -474,11 +474,33 @@ class TestExtractAmount:
|
||||
assert _extract_amount_from_text(text) == 5.50
|
||||
|
||||
def test_amount_due_with_usd_suffix(self):
|
||||
# PDF text may include "USD" after the number — regex should still work
|
||||
# via the bottom scan since the labeled-total regex won't match "USD"
|
||||
# "Total Charged" is in _TOTAL_RE — Pass 1 catches it
|
||||
text = 'Total Charged: $198.40 USD'
|
||||
assert _extract_amount_from_text(text) == 198.40
|
||||
|
||||
def test_top_amount_returned_by_max(self):
|
||||
# Display-style receipt: charge shown at top, no 'Total' label.
|
||||
# Pass 2 (max) must find $40.10 even though it is before the item list.
|
||||
text = 'LAYAL CAFE\n$40.10\n--------\nBreakfast 37.30\nCoffee 2.80'
|
||||
assert _extract_amount_from_text(text) == 40.10
|
||||
|
||||
def test_card_terminal_visa_line(self):
|
||||
# Card terminal: amount on a line prefixed with card-brand text.
|
||||
# VISA must NOT be in the skip list so the amount is captured.
|
||||
text = 'MERCHANT XYZ\nYHOOMHXAKKKEO4S VISA USD$ 36.78\nAuth 123456'
|
||||
assert _extract_amount_from_text(text) == 36.78
|
||||
|
||||
def test_max_beats_item_prices(self):
|
||||
# Receipt with several item prices — max should return the largest
|
||||
# (the total), not an item that appears last in the text.
|
||||
text = 'Burger 12.99\nFries 4.50\nDrink 2.99\nT0TAL 20.48'
|
||||
assert _extract_amount_from_text(text) == 20.48
|
||||
|
||||
def test_change_line_excluded_from_max(self):
|
||||
# Cash-tendered and change-due lines must be skipped so they never inflate the max.
|
||||
text = 'Items 8.49\nCash Tendered 20.00\nChange 11.51'
|
||||
assert _extract_amount_from_text(text) == 8.49
|
||||
|
||||
|
||||
class TestExtractDate:
|
||||
def test_iso_format(self):
|
||||
|
||||
Reference in New Issue
Block a user