expenses_agent: fix OCR '$→8' misread inflating receipt totals

Add _fix_ocr_dollar_as_8() which strips a spurious leading '8' when it
sits at a word boundary before a non-zero digit + 1–3 more digits + .dd
(covers $10–$9999).  Applied at the top of _extract_amount_from_text so
both the labeled-total pass and the max-scan pass benefit.

  845.00  → 45.00   ($45 misread as 845)
  885.00  → 85.00   ($85 misread as 885)
  8150.00 → 150.00  ($150 misread as 8150)
  85.00   → 85.00   UNCHANGED (real $85 correctly read)
  8.50    → 8.50    UNCHANGED (real $8.50 correctly read)

12 new tests covering fix cases, non-fix cases, and end-to-end extraction
(110 tests total, all passing).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-21 16:08:39 -04:00
parent aea2fa02b8
commit beac16a6a9
2 changed files with 73 additions and 1 deletions

View File

@@ -428,7 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates():
from agent_service.agents.expenses_agent import (
_extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
_MONTH_MAP, _get_vision_mode,
_fix_ocr_dollar_as_8, _MONTH_MAP, _get_vision_mode,
)
@@ -537,6 +537,54 @@ class TestExtractAmount:
assert _extract_amount_from_text(text) == 150.00
class TestFixOcrDollarAs8:
"""_fix_ocr_dollar_as_8 — strip spurious leading '8' OCR misread of '$'."""
def test_two_digit_amount_fixed(self):
# $45 misread as 845
assert _fix_ocr_dollar_as_8('Total: 845.00') == 'Total: 45.00'
def test_three_digit_amount_fixed(self):
# $150 misread as 8150
assert _fix_ocr_dollar_as_8('Amount: 8150.00') == 'Amount: 150.00'
def test_misread_dollar_85_fixed(self):
# $85 misread as 885 — result should be 85.00
assert _fix_ocr_dollar_as_8('Total: 885.00') == 'Total: 85.00'
def test_real_85_not_touched(self):
# Real $85 correctly read as 85.00 — 8 followed by only 1 digit → no match
assert _fix_ocr_dollar_as_8('Total: 85.00') == 'Total: 85.00'
def test_real_8_50_not_touched(self):
# Single-digit dollar amount, correctly read — unchanged
assert _fix_ocr_dollar_as_8('Price: 8.50') == 'Price: 8.50'
def test_explicit_dollar_sign_not_touched(self):
# Already has '$' — nothing to fix
assert _fix_ocr_dollar_as_8('Total: $45.00') == 'Total: $45.00'
def test_digit_before_8_not_touched(self):
# 8 is mid-number (preceded by digit) — lookbehind blocks it
assert _fix_ocr_dollar_as_8('Amount: 12845.00') == 'Amount: 12845.00'
def test_zero_leading_digit_not_touched(self):
# 8 followed by 0XX — could be real $8000; left alone
assert _fix_ocr_dollar_as_8('Total: 8045.00') == 'Total: 8045.00'
def test_extract_amount_applies_fix(self):
# Integration: extraction uses the fix internally
assert _extract_amount_from_text('TOTAL 845.00') == 45.00
def test_extract_amount_labeled_total_fixed(self):
assert _extract_amount_from_text('Grand Total: 8150.00') == 150.00
def test_extract_amount_pass2_scan_fixed(self):
# No 'Total' label — Pass 2 max scan must also see the fixed amount
text = 'BURGER 12.99\nFRIES 4.50\n845.00'
assert _extract_amount_from_text(text) == 45.00
class TestBankStatementDetection:
def _stmt(self, n: int) -> str:
"""Generate fake bank statement with n transaction lines."""