fix(expenses): improve receipt amount extraction and vendor naming
- Remove card brands (VISA/Mastercard/Amex/Discover) and the approved/authorized keywords from _SKIP_LINE_RE so card-terminal lines like "VISA USD$ 36.78" are no longer skipped
- Replace bottom-50% scan with full-text max scan (Pass 2): scans every line in the receipt and returns the largest dollar amount, correctly handling display-style receipts that show the charge at the top with no label (e.g. LAYAL CAFE $40.10 before the item list)
- Update vendor LLM prompt to ask the model to correct OCR garbling (e.g. "NeDonald's" → "McDonald's") and detect bank statements
- Add 4 new tests covering top-amount, card-terminal, max-beats-items, and change-exclusion scenarios (71 tests, all passing)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -21,11 +21,11 @@ _TOTAL_RE = re.compile(
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# Lines printed AFTER the total (change given, tip, etc.) — skip these
|
||||
# when doing the bottom-of-receipt scan so we don't mistake them for the total.
|
||||
# Lines that should never be treated as the total — change given back,
|
||||
# tip added after the fact, etc. Card-brand lines like "VISA USD$ 36.78"
|
||||
# are intentionally NOT listed here: the amount on those lines IS the charge.
|
||||
_SKIP_LINE_RE = re.compile(
|
||||
r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity|approved|'
|
||||
r'auth(?:orized)?|visa|mastercard|amex|discover)\b',
|
||||
r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity)\b',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
@@ -41,10 +41,15 @@ def _extract_amount_from_text(text: str) -> float:
|
||||
"""Return the final total from OCR receipt text, or 0.0 if not found.
|
||||
|
||||
Pass 1 — labeled total: 'Total:', 'Grand Total:', 'Amount Due:', etc.
|
||||
Pass 2 — bottom scan: reads lines from the bottom of the last 50% of text,
|
||||
skipping change/cash/tip lines. Handles cases where Tesseract
|
||||
garbled 'TOTAL' (e.g. 'T0TAL') or placed the amount on its own
|
||||
line below the label.
|
||||
Pass 2 — full-text maximum: scan every line for a dollar amount (skipping
|
||||
change/tip lines) and return the largest value found. This handles:
|
||||
• display-style receipts that show the charge at the top with no
|
||||
label (e.g. LAYAL CAFE — "$40.10" printed before the item list)
|
||||
• card-terminal printouts with lines like "VISA USD$ 36.78" that
|
||||
carry no 'Total' keyword
|
||||
The maximum heuristic works because the receipt total is always
|
||||
≥ any individual item price; Pass 1 (labeled total) catches the
|
||||
rare cases where a discount makes the total less than a line item.
|
||||
"""
|
||||
if not text:
|
||||
return 0.0
|
||||
@@ -60,20 +65,21 @@ def _extract_amount_from_text(text: str) -> float:
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Pass 2: bottom-of-receipt line scan
|
||||
# Only search the bottom half so item prices (middle section) are excluded
|
||||
bottom = text[max(0, int(len(text) * 0.5)):]
|
||||
for line in reversed(bottom.splitlines()):
|
||||
# Pass 2: maximum dollar amount across the full text
|
||||
best = 0.0
|
||||
for line in text.splitlines():
|
||||
if _SKIP_LINE_RE.search(line):
|
||||
continue
|
||||
m = _ANY_DOLLAR_RE.search(line)
|
||||
if m:
|
||||
try:
|
||||
val = float(m.group(1).replace(',', ''))
|
||||
if val > 0:
|
||||
return val
|
||||
if val > best:
|
||||
best = val
|
||||
except ValueError:
|
||||
pass
|
||||
if best > 0:
|
||||
return best
|
||||
|
||||
return 0.0
|
||||
|
||||
@@ -462,13 +468,18 @@ class ExpensesAgent(BaseAgent):
|
||||
excerpt = stripped[:600]
|
||||
prompt = (
|
||||
'Return ONLY valid JSON with exactly two keys:\n'
|
||||
'"vendor": the store or restaurant name, copied exactly from the '
|
||||
'first 1-3 lines of the receipt. Use "" if no clear name.\n'
|
||||
'"vendor": the merchant or store name from the receipt header. '
|
||||
'OCR often garbles text — use your knowledge to correct obvious '
|
||||
'errors (e.g. "NeDonald\'s" → "McDonald\'s", "TN-N-QUT" → '
|
||||
'"IN-N-OUT Burger", "Subwey" → "Subway", "LRYAL" → "LAYAL"). '
|
||||
'If this looks like a bank or credit-card statement listing '
|
||||
'multiple transactions rather than a single merchant receipt, '
|
||||
'use "". Use "" if no clear business name is visible.\n'
|
||||
f'"product_name": the single best match from [{product_list}] '
|
||||
'based on the type of business (restaurant→Meals, gas station→Fuel, '
|
||||
'hotel→Hotel, airline/transit→Transport, office store→Office Supplies). '
|
||||
'Use "" if none fit.\n\n'
|
||||
f'Receipt:\n{excerpt}\n\nJSON only:'
|
||||
f'Receipt text:\n{excerpt}\n\nJSON only:'
|
||||
)
|
||||
elif product_list:
|
||||
# OCR failed — guess category from filename only
|
||||
|
||||
Reference in New Issue
Block a user