fix(expenses): LAYAL CAFE $2.80 bug, United Airlines rotation & date

LAYAL CAFE ($2.80 instead of $42.90): - Add (?!\s*tax) lookahead to _TOTAL_RE so "Total Taxes $2.80" is never confused with the receipt total when OCR drops the "Taxes" word - Change Pass 1 from matches[-1] to max() so the largest labeled amount always wins, regardless of line order in the OCR output United Airlines (Subway/$0/wrong date): - Add OSD-based rotation correction in receipt_parser.py: after EXIF transpose, ask Tesseract's orientation-detection engine (--psm 0) what angle to rotate; applies to receipts photographed lying sideways where EXIF metadata cannot help - Add month-name date patterns (DD MON YYYY / MON DD YYYY) to _extract_date_from_text for airline/hotel receipts that print dates like "05 MAY 2026" instead of "05/07/26" 85 tests, all passing. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 00:46:08 -04:00
parent ce57d19528
commit ece811cccb
3 changed files with 90 additions and 8 deletions
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -15,10 +15,16 @@ from ..tools.expenses_tools import ExpensesTools
 # Matches an explicitly labeled total line.
 # Handles "Total: $22.46", "GRAND TOTAL 22.46", "Amount Due: 22.46",
 # "Total Sale $58.75" (gas stations), "Net Sale $X", etc.
+#
+# The negative lookahead (?!\s*tax) prevents "Total Tax" / "Total Taxes"
+# (a sub-total line present on restaurant receipts) from being confused
+# with the final total when Tesseract splits a two-column label+amount
+# layout across lines.
 _TOTAL_RE = re.compile(
    r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|'
    r'total\s*amount|total\s*charged|total\s*sale|net\s*sale|'
    r'sale\s*total|you\s*paid|amount\s*paid|total)'
+    r'(?!\s*tax)'                       # exclude "Total Tax / Total Taxes"
    r'\s*[:\-]?\s*\$?\s*([\d,]+\.\d{2})',
    re.IGNORECASE,
 )
@@ -52,6 +58,18 @@ def _is_likely_bank_statement(text: str) -> bool:
 _DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b')   # YYYY-MM-DD or YYYY/MM/DD
 _DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b')  # M/D/YYYY
 _DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b')  # M/D/YY
+# "05 MAY 2026"  or  "MAY 05 2026"  or  "05 May, 2026" (airline / hotel receipts)
+_DATE_MON_RE = re.compile(
+    r'\b(\d{1,2})\s+([A-Za-z]{3,9})[,\s]+(\d{4})\b'   # DD MON YYYY
+    r'|\b([A-Za-z]{3,9})\s+(\d{1,2})[,\s]+(\d{4})\b',  # MON DD YYYY
+)
+_MONTH_MAP: dict[str, int] = {
+    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
+    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
+    'january': 1, 'february': 2, 'march': 3, 'april': 4,
+    'june': 6, 'july': 7, 'august': 8, 'september': 9,
+    'october': 10, 'november': 11, 'december': 12,
+}


 def _extract_amount_from_text(text: str) -> float:
@@ -71,16 +89,22 @@ def _extract_amount_from_text(text: str) -> float:
    if not text:
        return 0.0

-    # Pass 1: explicit label match
+    # Pass 1: explicit label match — return the LARGEST labeled amount.
+    # Using max() rather than the last positional match handles the common
+    # OCR artefact where "Total\n$2.80" (garbled "Total Taxes") appears
+    # before "Total\n$42.90" in the text; the actual total wins on value.
    matches = list(_TOTAL_RE.finditer(text))
    if matches:
-        raw = matches[-1].group(1).replace(',', '')
-        try:
-            val = float(raw)
-            if val > 0:
-                return val
-        except ValueError:
-            pass
+        best_labeled = 0.0
+        for m in matches:
+            try:
+                val = float(m.group(1).replace(',', ''))
+                if val > best_labeled:
+                    best_labeled = val
+            except ValueError:
+                pass
+        if best_labeled > 0:
+            return best_labeled

    # Pass 2: maximum dollar amount across the full text
    best = 0.0
@@ -121,6 +145,19 @@ def _extract_date_from_text(text: str) -> str | None:
        if 1 <= mo <= 12 and 1 <= d <= 31:
            y = 2000 + yr if yr < 50 else 1900 + yr
            return f'{y}-{mo:02d}-{d:02d}'
+    # Month-name formats: "05 MAY 2026", "MAY 05 2026", "05 May, 2026"
+    # Common on airline, hotel, and formal business receipts.
+    m = _DATE_MON_RE.search(text)
+    if m:
+        if m.group(1):   # DD MON YYYY branch
+            d_s, mon_s, y_s = m.group(1), m.group(2), m.group(3)
+        else:            # MON DD YYYY branch
+            mon_s, d_s, y_s = m.group(4), m.group(5), m.group(6)
+        mo = _MONTH_MAP.get(mon_s.lower()[:3])
+        if mo:
+            d_i, y_i = int(d_s), int(y_s)
+            if 1 <= d_i <= 31 and 2000 <= y_i <= 2099:
+                return f'{y_i}-{mo:02d}-{d_i:02d}'
    return None

 logger = logging.getLogger(__name__)
--- a/agent_service/tools/receipt_parser.py
+++ b/agent_service/tools/receipt_parser.py
@@ -100,6 +100,23 @@ def _ocr_image_tesseract(data: bytes, filename: str) -> str:
        except Exception:
            pass  # exif_transpose requires Pillow >= 6.0

+        # ── Step 1b: Content-based rotation correction ───────────────────────
+        # EXIF transpose (Step 1) only corrects for phone-tilt metadata.
+        # If the receipt was physically laid sideways in the frame (e.g. a
+        # landscape receipt photographed with the phone upright), the pixels
+        # are genuinely rotated and EXIF can't help.  Ask Tesseract's OSD
+        # engine to detect the text orientation and rotate to correct it.
+        try:
+            osd = pytesseract.image_to_osd(img, config='--psm 0')
+            _am = re.search(r'Rotate:\s*(\d+)', osd)
+            if _am:
+                _angle = int(_am.group(1))
+                if _angle:
+                    img = img.rotate(_angle, expand=True)
+                    logger.debug('OSD: rotated %s by %d°', filename, _angle)
+        except Exception:
+            pass  # OSD unavailable or not enough text — proceed without correction
+
        # ── Step 2: Resize to working width (1800px) ──────────────────────────
        max_w = 1800
        if img.width > max_w:
--- a/tests/test_expenses_agent.py
+++ b/tests/test_expenses_agent.py
@@ -428,6 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates():

 from agent_service.agents.expenses_agent import (
    _extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
+    _MONTH_MAP,
 )


@@ -473,6 +474,19 @@ class TestExtractAmount:
        text = 'Items  5.00\nTax 0.50\nTotal\n5.50'
        assert _extract_amount_from_text(text) == 5.50

+    def test_total_taxes_excluded(self):
+        # "Total Taxes $2.80" must NOT be confused with the receipt total;
+        # the labeled-total regex excludes 'total tax/taxes' via lookahead.
+        text = 'Subtotal $40.10\nTotal Taxes $2.80\nTotal $42.90'
+        assert _extract_amount_from_text(text) == 42.90
+
+    def test_pass1_returns_max_not_last(self):
+        # If OCR garbles "Total Taxes" into "Total\n$2.80", _TOTAL_RE would
+        # accidentally match twice.  max() must win over positional [-1].
+        # Simulate by giving two labeled totals where smaller appears second.
+        text = 'Grand Total $42.90\nTotal $2.80'
+        assert _extract_amount_from_text(text) == 42.90
+
    def test_total_sale_gas_station(self):
        # Costco / Shell gas receipts say "Total Sale $X.XX", not "Total: $X.XX"
        text = 'Pump  9  16.189 Gal\nRegular  $ 58.75\nTotal Sale  $ 58.75'
@@ -566,6 +580,20 @@ class TestExtractDate:
    def test_us_short_year(self):
        assert _extract_date_from_text('05/09/26') == '2026-05-09'

+    def test_dd_mon_yyyy(self):
+        # Airline / hotel receipts: "05 MAY 2026", "Issue Date: 05 May 2026"
+        assert _extract_date_from_text('Issue Date: 05 MAY 2026 MIA A70') == '2026-05-05'
+
+    def test_mon_dd_yyyy(self):
+        assert _extract_date_from_text('MAY 05 2026') == '2026-05-05'
+
+    def test_mon_dd_comma_yyyy(self):
+        assert _extract_date_from_text('May 5, 2026') == '2026-05-05'
+
+    def test_month_map_completeness(self):
+        # All twelve three-letter abbreviations must be present
+        assert len({k for k in _MONTH_MAP if len(k) == 3}) == 12
+
    def test_no_date(self):
        assert _extract_date_from_text('No date here') is None