diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py index 387c2f4..0f870e8 100644 --- a/agent_service/agents/expenses_agent.py +++ b/agent_service/agents/expenses_agent.py @@ -29,6 +29,27 @@ _TOTAL_RE = re.compile( re.IGNORECASE, ) +# OCR artefact: the '$' glyph is often misclassified as '8', turning +# 'Total: $45.00' into 'Total: 845.00'. We strip the spurious leading '8' +# when it sits at a word boundary and is followed by a non-zero digit then +# 1-3 more digits + two decimal places. This covers the $10–$9999 range. +# +# 845.00 → 45.00 (was $45, OCR gave 845) +# 885.00 → 85.00 (was $85, OCR gave 885) +# 8150.00 → 150.00 (was $150, OCR gave 8150) +# 85.00 → 85.00 UNCHANGED — real $85 correctly read +# 8.50 → 8.50 UNCHANGED — real $8.50 correctly read +# 12845.00→ 12845.00 UNCHANGED — digit before the 8 blocks lookbehind +# Edge case: a real $8xx amount correctly read (e.g. 840.00) may be reduced +# to $40; this is rare compared to the misread and obvious on human review. +_OCR_DOLLAR_MISREAD_RE = re.compile(r'(?<!\d)8([1-9]\d{1,3}\.\d{2})') + + +def _fix_ocr_dollar_as_8(text: str) -> str: + """Strip a spurious leading '8' that is an OCR misread of '$'.""" + return _OCR_DOLLAR_MISREAD_RE.sub(r'\1', text) + + # Lines that should never be treated as the total — change given back, # tip added after the fact, etc. Card-brand lines like "VISA USD$ 36.78" # are intentionally NOT listed here: the amount on those lines IS the charge. @@ -109,6 +130,9 @@ def _extract_amount_from_text(text: str) -> float: if not text: return 0.0 + # Normalise '$→8' OCR misread before any pattern matching. + text = _fix_ocr_dollar_as_8(text) + # Pass 1: explicit label match — return the LARGEST labeled amount. 
# Using max() rather than the last positional match handles the common # OCR artefact where "Total\n$2.80" (garbled "Total Taxes") appears diff --git a/tests/test_expenses_agent.py b/tests/test_expenses_agent.py index 58d7e23..1ab11b3 100644 --- a/tests/test_expenses_agent.py +++ b/tests/test_expenses_agent.py @@ -428,7 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates(): from agent_service.agents.expenses_agent import ( _extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement, - _MONTH_MAP, _get_vision_mode, + _fix_ocr_dollar_as_8, _MONTH_MAP, _get_vision_mode, ) @@ -537,6 +537,54 @@ class TestExtractAmount: assert _extract_amount_from_text(text) == 150.00 +class TestFixOcrDollarAs8: + """_fix_ocr_dollar_as_8 — strip spurious leading '8' OCR misread of '$'.""" + + def test_two_digit_amount_fixed(self): + # $45 misread as 845 + assert _fix_ocr_dollar_as_8('Total: 845.00') == 'Total: 45.00' + + def test_three_digit_amount_fixed(self): + # $150 misread as 8150 + assert _fix_ocr_dollar_as_8('Amount: 8150.00') == 'Amount: 150.00' + + def test_misread_dollar_85_fixed(self): + # $85 misread as 885 — result should be 85.00 + assert _fix_ocr_dollar_as_8('Total: 885.00') == 'Total: 85.00' + + def test_real_85_not_touched(self): + # Real $85 correctly read as 85.00 — 8 followed by only 1 digit → no match + assert _fix_ocr_dollar_as_8('Total: 85.00') == 'Total: 85.00' + + def test_real_8_50_not_touched(self): + # Single-digit dollar amount, correctly read — unchanged + assert _fix_ocr_dollar_as_8('Price: 8.50') == 'Price: 8.50' + + def test_explicit_dollar_sign_not_touched(self): + # Already has '$' — nothing to fix + assert _fix_ocr_dollar_as_8('Total: $45.00') == 'Total: $45.00' + + def test_digit_before_8_not_touched(self): + # 8 is mid-number (preceded by digit) — lookbehind blocks it + assert _fix_ocr_dollar_as_8('Amount: 12845.00') == 'Amount: 12845.00' + + def test_zero_leading_digit_not_touched(self): + # 8 followed by 0XX — 
could be real $8000; left alone + assert _fix_ocr_dollar_as_8('Total: 8045.00') == 'Total: 8045.00' + + def test_extract_amount_applies_fix(self): + # Integration: extraction uses the fix internally + assert _extract_amount_from_text('TOTAL 845.00') == 45.00 + + def test_extract_amount_labeled_total_fixed(self): + assert _extract_amount_from_text('Grand Total: 8150.00') == 150.00 + + def test_extract_amount_pass2_scan_fixed(self): + # No 'Total' label — Pass 2 max scan must also see the fixed amount + text = 'BURGER 12.99\nFRIES 4.50\n845.00' + assert _extract_amount_from_text(text) == 45.00 + + class TestBankStatementDetection: def _stmt(self, n: int) -> str: """Generate fake bank statement with n transaction lines."""