Improve OCR preprocessing and amount extraction robustness

Image preprocessing (receipt_parser.py): - Add ImageOps.exif_transpose() — fixes portrait photos stored with EXIF rotation metadata (most phone photos); without this, Tesseract reads a rotated image and produces garbage - Upscale images < 600px wide for better character recognition - Raise binarization threshold 140→160 for faint thermal-print receipts - Try PSM 6 (single text block) first, then PSM 4 and PSM 11 as fallbacks; PSM 6 is better suited to single-column receipt layouts Amount extraction (expenses_agent.py): - Add a Pass 2 bottom-of-receipt line scan for when the labeled Total: regex fails; it reads lines bottom-to-top in the last 50% of the text, skipping change/tip lines — handles the 'T0TAL' OCR misread and amount-on-next-line layouts - Add _SKIP_LINE_RE and _ANY_DOLLAR_RE module-level patterns - Add 8 new tests covering garbled total, change-skip, USD suffix, etc. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 23:33:38 -04:00
parent f1a8add84b
commit 1536d83376
3 changed files with 100 additions and 17 deletions
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -12,7 +12,7 @@ from ..tools.expenses_tools import ExpensesTools
 # Receipt OCR helpers — regex-based, deterministic extraction
 # ---------------------------------------------------------------------------

-# Matches the final-total line on a receipt.
+# Matches an explicitly labeled total line.
 # Handles "Total: $22.46", "GRAND TOTAL 22.46", "Amount Due: 22.46", etc.
 _TOTAL_RE = re.compile(
    r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|'
@@ -21,22 +21,60 @@ _TOTAL_RE = re.compile(
    re.IGNORECASE,
 )

+# Lines printed AFTER the total (change, cash paid/tendered, tip/gratuity, card brand or
+# approval/auth lines) — the bottom-of-receipt scan skips them so they aren't taken as the total.
+_SKIP_LINE_RE = re.compile(
+    r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity|approved|'
+    r'auth(?:orized)?|visa|mastercard|amex|discover)\b',
+    re.IGNORECASE,
+)
+
+# Any standalone dollar-like amount: optional $, 1-6 digit/comma chars (commas count), 2 decimals
+_ANY_DOLLAR_RE = re.compile(r'(?<!\d)\$?\s*([\d,]{1,6}\.\d{2})(?!\d)')
+
 _DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b')   # YYYY-MM-DD or YYYY/MM/DD
 _DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b')  # M/D/YYYY
 _DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b')  # M/D/YY


 def _extract_amount_from_text(text: str) -> float:
-    """Return the final total from OCR receipt text, or 0.0 if not found."""
+    """Return the final total from OCR receipt text, or 0.0 if not found.
+
+    Pass 1 — labeled total: the last 'Total:' / 'Grand Total:' / 'Amount Due:'
+             style match; a zero or unparsable value falls through to Pass 2.
+    Pass 2 — bottom scan: reads whole lines bottom-to-top, up to and including
+             the line holding the text midpoint, skipping change/cash/tip lines.
+             Covers a garbled 'TOTAL' (e.g. 'T0TAL') or an amount on its own line.
+    """
     if not text:
         return 0.0
+
+    # Pass 1: explicit label match (the last labeled match is taken as the total)
     matches = list(_TOTAL_RE.finditer(text))
     if matches:
-        raw = matches[-1].group(1).replace(',', '')  # last match = grand total
+        raw = matches[-1].group(1).replace(',', '')
         try:
-            return float(raw)
+            val = float(raw)
+            if val > 0:
+                return val
         except ValueError:
             pass
+
+    # Pass 2: bottom-of-receipt scan over the bottom half (item prices excluded),
+    # starting at the line holding the midpoint so no amount/label is cut in half
+    bottom = text[text.rfind('\n', 0, len(text) // 2) + 1:]
+    for line in reversed(bottom.splitlines()):
+        if _SKIP_LINE_RE.search(line):
+            continue
+        m = _ANY_DOLLAR_RE.search(line)
+        if m:
+            try:
+                val = float(m.group(1).replace(',', ''))
+                if val > 0:
+                    return val
+            except ValueError:
+                pass
+
     return 0.0