From ece811cccb57b4cffc3601e1c99a1c465213eb7d Mon Sep 17 00:00:00 2001
From: Carlos Garcia <tocmo@DESKTOP-O6U0UOS.AVC.local>
Date: Thu, 21 May 2026 00:46:08 -0400
Subject: [PATCH] fix(expenses): LAYAL CAFE $2.80 bug, United Airlines rotation
 & date

LAYAL CAFE ($2.80 instead of $42.90):
- Add (?!\s*tax) lookahead to _TOTAL_RE so a "Total Tax" / "Total Taxes"
  label (e.g. "Total Taxes $2.80") is never matched as the receipt total;
  the case where OCR drops the "Taxes" word is covered by the max() change
  below
- Change Pass 1 from matches[-1] to max() so the largest labeled amount
  always wins, regardless of line order in the OCR output

United Airlines (Subway/$0/wrong date):
- Add OSD-based rotation correction in receipt_parser.py: after EXIF
  transpose, ask Tesseract's orientation-detection engine (--psm 0) what
  angle to rotate; applies to receipts photographed lying sideways where
  EXIF metadata cannot help
- Add month-name date patterns (DD MON YYYY / MON DD YYYY) to
  _extract_date_from_text for airline/hotel receipts that print dates
  like "05 MAY 2026" instead of "05/05/26"

85 tests, all passing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 agent_service/agents/expenses_agent.py | 53 ++++++++++++++++++++++----
 agent_service/tools/receipt_parser.py  | 17 +++++++++
 tests/test_expenses_agent.py           | 28 ++++++++++++++
 3 files changed, 90 insertions(+), 8 deletions(-)

diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py
index 2fa016a..3eca3ec 100644
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -15,10 +15,16 @@ from ..tools.expenses_tools import ExpensesTools
 # Matches an explicitly labeled total line.
 # Handles "Total: $22.46", "GRAND TOTAL 22.46", "Amount Due: 22.46",
 # "Total Sale $58.75" (gas stations), "Net Sale $X", etc.
+#
+# The negative lookahead (?!\s*tax) rejects a "Total Tax" / "Total Taxes"
+# label (a tax sub-total line on restaurant receipts) as the final total.
+# If Tesseract splits a two-column label+amount layout and the word "Tax"
+# is lost, the lookahead cannot fire; Pass 1's max() handles that case.
 _TOTAL_RE = re.compile(
     r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|'
     r'total\s*amount|total\s*charged|total\s*sale|net\s*sale|'
     r'sale\s*total|you\s*paid|amount\s*paid|total)'
+    r'(?!\s*tax)'                       # exclude "Total Tax / Total Taxes"
     r'\s*[:\-]?\s*\$?\s*([\d,]+\.\d{2})',
     re.IGNORECASE,
 )
@@ -52,6 +58,18 @@ def _is_likely_bank_statement(text: str) -> bool:
 _DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b')   # YYYY-MM-DD or YYYY/MM/DD
 _DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b')  # M/D/YYYY
 _DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b')  # M/D/YY
+# "05 MAY 2026"  or  "MAY 05 2026"  or  "05 May, 2026" (airline / hotel receipts)
+_DATE_MON_RE = re.compile(
+    r'\b(\d{1,2})\s+([A-Za-z]{3,9})[,\s]+(\d{4})\b'   # DD MON YYYY
+    r'|\b([A-Za-z]{3,9})\s+(\d{1,2})[,\s]+(\d{4})\b',  # MON DD YYYY
+)
+_MONTH_MAP: dict[str, int] = {
+    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
+    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
+    'january': 1, 'february': 2, 'march': 3, 'april': 4,
+    'june': 6, 'july': 7, 'august': 8, 'september': 9,
+    'october': 10, 'november': 11, 'december': 12,
+}
 
 
 def _extract_amount_from_text(text: str) -> float:
@@ -71,16 +89,22 @@ def _extract_amount_from_text(text: str) -> float:
     if not text:
         return 0.0
 
-    # Pass 1: explicit label match
+    # Pass 1: explicit label match — return the LARGEST labeled amount.
+    # Using max() rather than the last positional match handles the common
+    # OCR artefact where "Total\n$2.80" (garbled "Total Taxes") appears
+    # before "Total\n$42.90" in the text; the actual total wins on value.
     matches = list(_TOTAL_RE.finditer(text))
     if matches:
-        raw = matches[-1].group(1).replace(',', '')
-        try:
-            val = float(raw)
-            if val > 0:
-                return val
-        except ValueError:
-            pass
+        best_labeled = 0.0
+        for m in matches:
+            try:
+                val = float(m.group(1).replace(',', ''))
+                if val > best_labeled:
+                    best_labeled = val
+            except ValueError:
+                pass
+        if best_labeled > 0:
+            return best_labeled
 
     # Pass 2: maximum dollar amount across the full text
     best = 0.0
@@ -121,6 +145,19 @@ def _extract_date_from_text(text: str) -> str | None:
         if 1 <= mo <= 12 and 1 <= d <= 31:
             y = 2000 + yr if yr < 50 else 1900 + yr
             return f'{y}-{mo:02d}-{d:02d}'
+    # Month-name formats: "05 MAY 2026", "MAY 05 2026", "05 May, 2026" (common
+    # on airline, hotel, and formal business receipts); month = first 3 letters.
+    m = _DATE_MON_RE.search(text)
+    if m:
+        if m.group(1):   # DD MON YYYY branch
+            d_s, mon_s, y_s = m.group(1), m.group(2), m.group(3)
+        else:            # MON DD YYYY branch
+            mon_s, d_s, y_s = m.group(4), m.group(5), m.group(6)
+        mo = _MONTH_MAP.get(mon_s.lower()[:3])
+        if mo:
+            d_i, y_i = int(d_s), int(y_s)
+            if 1 <= d_i <= 31 and 2000 <= y_i <= 2099:
+                return f'{y_i}-{mo:02d}-{d_i:02d}'
     return None
 
 logger = logging.getLogger(__name__)
diff --git a/agent_service/tools/receipt_parser.py b/agent_service/tools/receipt_parser.py
index 1598ab4..e366e95 100644
--- a/agent_service/tools/receipt_parser.py
+++ b/agent_service/tools/receipt_parser.py
@@ -100,6 +100,23 @@ def _ocr_image_tesseract(data: bytes, filename: str) -> str:
         except Exception:
             pass  # exif_transpose requires Pillow >= 6.0
 
+        # ── Step 1b: Content-based rotation correction ───────────────────────
+        # EXIF transpose (Step 1) only corrects for phone-tilt metadata.  If
+        # the receipt was physically laid sideways in the frame (e.g. a landscape
+        # receipt photographed with the phone upright), the pixels are genuinely
+        # rotated and EXIF can't help, so ask Tesseract's OSD engine for the angle.
+        # NOTE(review): Image.rotate() turns counter-clockwise; confirm OSD's sign.
+        try:
+            osd = pytesseract.image_to_osd(img, config='--psm 0')
+            _am = re.search(r'Rotate:\s*(\d+)', osd)
+            if _am:
+                _angle = int(_am.group(1))
+                if _angle:
+                    img = img.rotate(_angle, expand=True)
+                    logger.debug('OSD: rotated %s by %d°', filename, _angle)
+        except Exception:
+            pass  # OSD unavailable or not enough text — proceed without correction
+
         # ── Step 2: Resize to working width (1800px) ──────────────────────────
         max_w = 1800
         if img.width > max_w:
diff --git a/tests/test_expenses_agent.py b/tests/test_expenses_agent.py
index 86e583c..8f45a8b 100644
--- a/tests/test_expenses_agent.py
+++ b/tests/test_expenses_agent.py
@@ -428,6 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates():
 
 from agent_service.agents.expenses_agent import (
     _extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
+    _MONTH_MAP,
 )
 
 
@@ -473,6 +474,19 @@ class TestExtractAmount:
         text = 'Items  5.00\nTax 0.50\nTotal\n5.50'
         assert _extract_amount_from_text(text) == 5.50
 
+    def test_total_taxes_excluded(self):
+        # "Total Taxes $2.80" must NOT be confused with the receipt total;
+        # the labeled-total regex excludes 'total tax/taxes' via lookahead.
+        text = 'Subtotal $40.10\nTotal Taxes $2.80\nTotal $42.90'
+        assert _extract_amount_from_text(text) == 42.90
+
+    def test_pass1_returns_max_not_last(self):
+        # If OCR garbles "Total Taxes" into "Total\n$2.80", _TOTAL_RE would
+        # accidentally match twice.  max() must win over positional [-1].
+        # Simulate by giving two labeled totals where smaller appears second.
+        text = 'Grand Total $42.90\nTotal $2.80'
+        assert _extract_amount_from_text(text) == 42.90
+
     def test_total_sale_gas_station(self):
         # Costco / Shell gas receipts say "Total Sale $X.XX", not "Total: $X.XX"
         text = 'Pump  9  16.189 Gal\nRegular  $ 58.75\nTotal Sale  $ 58.75'
@@ -566,6 +580,20 @@ class TestExtractDate:
     def test_us_short_year(self):
         assert _extract_date_from_text('05/09/26') == '2026-05-09'
 
+    def test_dd_mon_yyyy(self):
+        # Airline / hotel receipts: "05 MAY 2026", "Issue Date: 05 May 2026"
+        assert _extract_date_from_text('Issue Date: 05 MAY 2026 MIA A70') == '2026-05-05'
+
+    def test_mon_dd_yyyy(self):
+        assert _extract_date_from_text('MAY 05 2026') == '2026-05-05'
+
+    def test_mon_dd_comma_yyyy(self):
+        assert _extract_date_from_text('May 5, 2026') == '2026-05-05'
+
+    def test_month_map_completeness(self):
+        # All twelve three-letter abbreviations must be present
+        assert len({k for k in _MONTH_MAP if len(k) == 3}) == 12
+
     def test_no_date(self):
         assert _extract_date_from_text('No date here') is None