From ece811cccb57b4cffc3601e1c99a1c465213eb7d Mon Sep 17 00:00:00 2001 From: Carlos Garcia Date: Thu, 21 May 2026 00:46:08 -0400 Subject: [PATCH] fix(expenses): LAYAL CAFE $2.80 bug, United Airlines rotation & date LAYAL CAFE ($2.80 instead of $42.90): - Add (?!\s*tax) lookahead to _TOTAL_RE so "Total Taxes $2.80" is never matched as the receipt total while the word "Tax"/"Taxes" is still present in the OCR text - Change Pass 1 from matches[-1] to max() so the largest labeled amount always wins, regardless of line order in the OCR output; this also covers the case where OCR drops the "Taxes" word and the tax line reads as a bare "Total $2.80" United Airlines (Subway/$0/wrong date): - Add OSD-based rotation correction in receipt_parser.py: after EXIF transpose, ask Tesseract's orientation-detection engine (--psm 0) what angle to rotate; applies to receipts photographed lying sideways where EXIF metadata cannot help - Add month-name date patterns (DD MON YYYY / MON DD YYYY) to _extract_date_from_text for airline/hotel receipts that print dates like "05 MAY 2026" instead of "05/07/26" 85 tests, all passing. Co-Authored-By: Claude Sonnet 4.6 --- agent_service/agents/expenses_agent.py | 53 ++++++++++++++++++++++---- agent_service/tools/receipt_parser.py | 17 +++++++++ tests/test_expenses_agent.py | 28 ++++++++++++++ 3 files changed, 90 insertions(+), 8 deletions(-) diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py index 2fa016a..3eca3ec 100644 --- a/agent_service/agents/expenses_agent.py +++ b/agent_service/agents/expenses_agent.py @@ -15,10 +15,16 @@ from ..tools.expenses_tools import ExpensesTools # Matches an explicitly labeled total line. # Handles "Total: $22.46", "GRAND TOTAL 22.46", "Amount Due: 22.46", # "Total Sale $58.75" (gas stations), "Net Sale $X", etc. +# +# The negative lookahead (?!\s*tax) prevents "Total Tax" / "Total Taxes" +# (a sub-total line present on restaurant receipts) from being confused +# with the final total when Tesseract splits a two-column label+amount +# layout across lines. 
_TOTAL_RE = re.compile( r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|' r'total\s*amount|total\s*charged|total\s*sale|net\s*sale|' r'sale\s*total|you\s*paid|amount\s*paid|total)' + r'(?!\s*tax)' # exclude "Total Tax / Total Taxes" r'\s*[:\-]?\s*\$?\s*([\d,]+\.\d{2})', re.IGNORECASE, ) @@ -52,6 +58,18 @@ def _is_likely_bank_statement(text: str) -> bool: _DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD _DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY _DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY +# "05 MAY 2026" or "MAY 05 2026" or "05 May, 2026" (airline / hotel receipts) +_DATE_MON_RE = re.compile( + r'\b(\d{1,2})\s+([A-Za-z]{3,9})[,\s]+(\d{4})\b' # DD MON YYYY + r'|\b([A-Za-z]{3,9})\s+(\d{1,2})[,\s]+(\d{4})\b', # MON DD YYYY +) +_MONTH_MAP: dict[str, int] = { + 'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, + 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12, + 'january': 1, 'february': 2, 'march': 3, 'april': 4, + 'june': 6, 'july': 7, 'august': 8, 'september': 9, + 'october': 10, 'november': 11, 'december': 12, +} def _extract_amount_from_text(text: str) -> float: @@ -71,16 +89,22 @@ def _extract_amount_from_text(text: str) -> float: if not text: return 0.0 - # Pass 1: explicit label match + # Pass 1: explicit label match — return the LARGEST labeled amount. + # Using max() rather than the last positional match handles the common + # OCR artefact where "Total\n$2.80" (garbled "Total Taxes") appears + # after "Total\n$42.90" in the text; the actual total wins on value. 
matches = list(_TOTAL_RE.finditer(text)) if matches: - raw = matches[-1].group(1).replace(',', '') - try: - val = float(raw) - if val > 0: - return val - except ValueError: - pass + best_labeled = 0.0 + for m in matches: + try: + val = float(m.group(1).replace(',', '')) + if val > best_labeled: + best_labeled = val + except ValueError: + pass + if best_labeled > 0: + return best_labeled # Pass 2: maximum dollar amount across the full text best = 0.0 @@ -121,6 +145,19 @@ def _extract_date_from_text(text: str) -> str | None: if 1 <= mo <= 12 and 1 <= d <= 31: y = 2000 + yr if yr < 50 else 1900 + yr return f'{y}-{mo:02d}-{d:02d}' + # Month-name formats: "05 MAY 2026", "MAY 05 2026", "05 May, 2026" + # Common on airline, hotel, and formal business receipts. + m = _DATE_MON_RE.search(text) + if m: + if m.group(1): # DD MON YYYY branch + d_s, mon_s, y_s = m.group(1), m.group(2), m.group(3) + else: # MON DD YYYY branch + mon_s, d_s, y_s = m.group(4), m.group(5), m.group(6) + mo = _MONTH_MAP.get(mon_s.lower()[:3]) + if mo: + d_i, y_i = int(d_s), int(y_s) + if 1 <= d_i <= 31 and 2000 <= y_i <= 2099: + return f'{y_i}-{mo:02d}-{d_i:02d}' return None logger = logging.getLogger(__name__) diff --git a/agent_service/tools/receipt_parser.py b/agent_service/tools/receipt_parser.py index 1598ab4..e366e95 100644 --- a/agent_service/tools/receipt_parser.py +++ b/agent_service/tools/receipt_parser.py @@ -100,6 +100,23 @@ def _ocr_image_tesseract(data: bytes, filename: str) -> str: except Exception: pass # exif_transpose requires Pillow >= 6.0 + # ── Step 1b: Content-based rotation correction ─────────────────────── + # EXIF transpose (Step 1) only corrects for phone-tilt metadata. + # If the receipt was physically laid sideways in the frame (e.g. a + # landscape receipt photographed with the phone upright), the pixels + # are genuinely rotated and EXIF can't help. Ask Tesseract's OSD + # engine to detect the text orientation and rotate to correct it. 
+ try: + osd = pytesseract.image_to_osd(img, config='--psm 0') + _am = re.search(r'Rotate:\s*(\d+)', osd) + if _am: + _angle = int(_am.group(1)) + if _angle: + img = img.rotate(_angle, expand=True) + logger.debug('OSD: rotated %s by %d°', filename, _angle) + except Exception: + pass # OSD unavailable or not enough text — proceed without correction + # ── Step 2: Resize to working width (1800px) ────────────────────────── max_w = 1800 if img.width > max_w: diff --git a/tests/test_expenses_agent.py b/tests/test_expenses_agent.py index 86e583c..8f45a8b 100644 --- a/tests/test_expenses_agent.py +++ b/tests/test_expenses_agent.py @@ -428,6 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates(): from agent_service.agents.expenses_agent import ( _extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement, + _MONTH_MAP, ) @@ -473,6 +474,19 @@ class TestExtractAmount: text = 'Items 5.00\nTax 0.50\nTotal\n5.50' assert _extract_amount_from_text(text) == 5.50 + def test_total_taxes_excluded(self): + # "Total Taxes $2.80" must NOT be confused with the receipt total; + # the labeled-total regex excludes 'total tax/taxes' via lookahead. + text = 'Subtotal $40.10\nTotal Taxes $2.80\nTotal $42.90' + assert _extract_amount_from_text(text) == 42.90 + + def test_pass1_returns_max_not_last(self): + # If OCR garbles "Total Taxes" into "Total\n$2.80", _TOTAL_RE would + # accidentally match twice. max() must win over positional [-1]. + # Simulate by giving two labeled totals where smaller appears second. 
+ text = 'Grand Total $42.90\nTotal $2.80' + assert _extract_amount_from_text(text) == 42.90 + def test_total_sale_gas_station(self): # Costco / Shell gas receipts say "Total Sale $X.XX", not "Total: $X.XX" text = 'Pump 9 16.189 Gal\nRegular $ 58.75\nTotal Sale $ 58.75' @@ -566,6 +580,20 @@ class TestExtractDate: def test_us_short_year(self): assert _extract_date_from_text('05/09/26') == '2026-05-09' + def test_dd_mon_yyyy(self): + # Airline / hotel receipts: "05 MAY 2026", "Issue Date: 05 May 2026" + assert _extract_date_from_text('Issue Date: 05 MAY 2026 MIA A70') == '2026-05-05' + + def test_mon_dd_yyyy(self): + assert _extract_date_from_text('MAY 05 2026') == '2026-05-05' + + def test_mon_dd_comma_yyyy(self): + assert _extract_date_from_text('May 5, 2026') == '2026-05-05' + + def test_month_map_completeness(self): + # All twelve three-letter abbreviations must be present + assert len({k for k in _MONTH_MAP if len(k) == 3}) == 12 + def test_no_date(self): assert _extract_date_from_text('No date here') is None