From 1536d83376ae85eace10b2ab796795c09b6ee96a Mon Sep 17 00:00:00 2001 From: Carlos Garcia Date: Wed, 20 May 2026 23:33:38 -0400 Subject: [PATCH] Improve OCR preprocessing and amount extraction robustness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Image preprocessing (receipt_parser.py): - Add ImageOps.exif_transpose() — fixes portrait photos stored with EXIF rotation metadata (most phone photos); without this Tesseract reads a rotated image and produces garbage - Upscale images < 600px wide for better character recognition - Raise binarization threshold 140→160 for faint thermal-print receipts - Try PSM 6 (single text block) first, with PSM 4 and PSM 11 as fallbacks; PSM 6 is better suited to single-column receipt layout Amount extraction (expenses_agent.py): - Add Pass 2 bottom-of-receipt line scan when labeled Total: regex fails; reads lines bottom-to-top in the last 50% of text, skipping change/tip lines — handles 'T0TAL' OCR misread and amount-on-next-line layout - Add _SKIP_LINE_RE and _ANY_DOLLAR_RE module-level patterns - 4 new tests covering garbled total, change-skip, USD suffix, etc. Co-Authored-By: Claude Sonnet 4.6 --- agent_service/agents/expenses_agent.py | 46 +++++++++++++++++++++--- agent_service/tools/receipt_parser.py | 50 +++++++++++++++++++------- tests/test_expenses_agent.py | 21 +++++++++++ 3 files changed, 100 insertions(+), 17 deletions(-) diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py index cc9c4cb..2fd3c53 100644 --- a/agent_service/agents/expenses_agent.py +++ b/agent_service/agents/expenses_agent.py @@ -12,7 +12,7 @@ from ..tools.expenses_tools import ExpensesTools # Receipt OCR helpers — regex-based, deterministic extraction # --------------------------------------------------------------------------- -# Matches the final-total line on a receipt. +# Matches an explicitly labeled total line. 
# Handles "Total: $22.46", "GRAND TOTAL 22.46", "Amount Due: 22.46", etc. _TOTAL_RE = re.compile( r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|' @@ -21,22 +21,60 @@ _TOTAL_RE = re.compile( re.IGNORECASE, ) +# Lines printed AFTER the total (change given, tip, etc.) — skip these +# when doing the bottom-of-receipt scan so we don't mistake them for the total. +_SKIP_LINE_RE = re.compile( + r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity|approved|' + r'auth(?:orized)?|visa|mastercard|amex|discover)\b', + re.IGNORECASE, +) + +# Any standalone dollar-like amount (optional $, up to 6 digits, 2 decimals) +_ANY_DOLLAR_RE = re.compile(r'(? float: - """Return the final total from OCR receipt text, or 0.0 if not found.""" + """Return the final total from OCR receipt text, or 0.0 if not found. + + Pass 1 — labeled total: 'Total:', 'Grand Total:', 'Amount Due:', etc. + Pass 2 — bottom scan: reads lines from the bottom of the last 50% of text, + skipping change/cash/tip lines. Handles cases where Tesseract + garbled 'TOTAL' (e.g. 'T0TAL') or placed the amount on its own + line below the label. 
+ """ if not text: return 0.0 + + # Pass 1: explicit label match matches = list(_TOTAL_RE.finditer(text)) if matches: - raw = matches[-1].group(1).replace(',', '') # last match = grand total + raw = matches[-1].group(1).replace(',', '') try: - return float(raw) + val = float(raw) + if val > 0: + return val except ValueError: pass + + # Pass 2: bottom-of-receipt line scan + # Only search the bottom half so item prices (middle section) are excluded + bottom = text[max(0, int(len(text) * 0.5)):] + for line in reversed(bottom.splitlines()): + if _SKIP_LINE_RE.search(line): + continue + m = _ANY_DOLLAR_RE.search(line) + if m: + try: + val = float(m.group(1).replace(',', '')) + if val > 0: + return val + except ValueError: + pass + return 0.0 diff --git a/agent_service/tools/receipt_parser.py b/agent_service/tools/receipt_parser.py index 981cca2..1598ab4 100644 --- a/agent_service/tools/receipt_parser.py +++ b/agent_service/tools/receipt_parser.py @@ -85,34 +85,58 @@ def _ocr_image(data: bytes, filename: str) -> str: def _ocr_image_tesseract(data: bytes, filename: str) -> str: - """Tesseract-based OCR pipeline (fallback).""" + """Tesseract-based OCR pipeline with phone-photo preprocessing.""" try: from PIL import Image, ImageFilter, ImageOps import pytesseract img = Image.open(io.BytesIO(data)) - # Resize very large images — tesseract is slower and less accurate at - # phone-camera resolution; 1800px wide is plenty for receipt text. + # ── Step 1: EXIF rotation correction ───────────────────────────────── + # Phone photos are stored with EXIF orientation metadata but the pixel + # data is not actually rotated. Without this fix Tesseract reads a + # portrait receipt as a landscape image and produces garbage. 
+ try: + img = ImageOps.exif_transpose(img) + except Exception: + pass # exif_transpose requires Pillow >= 6.0 + + # ── Step 2: Resize to working width (1800px) ────────────────────────── max_w = 1800 if img.width > max_w: scale = max_w / img.width img = img.resize((max_w, int(img.height * scale)), Image.LANCZOS) + # Upscale very small images — Tesseract accuracy drops below ~600px + elif img.width < 600: + scale = 600 / img.width + img = img.resize((600, int(img.height * scale)), Image.LANCZOS) - # Grayscale + adaptive binarisation + sharpen + # ── Step 3: Grayscale + contrast ───────────────────────────────────── img = ImageOps.grayscale(img) img = ImageOps.autocontrast(img) - img = img.point(lambda x: 0 if x < 140 else 255) + + # ── Step 4: Sharpen then binarize ───────────────────────────────────── + # Sharpen first so edges are crisp before thresholding. + # Threshold 160 (was 140) — gentler for faint thermal-print receipts + # where light gray text would be wiped out by the stricter threshold. img = img.filter(ImageFilter.SHARPEN) + img = img.point(lambda x: 0 if x < 160 else 255) - # psm 1 = automatic page segmentation + OSD (handles rotated receipts). - # Fall back to psm 6 if OSD data is missing. 
- try: - text = pytesseract.image_to_string(img, config='--oem 3 --psm 1').strip() - except Exception: - text = pytesseract.image_to_string(img, config='--oem 3 --psm 6').strip() + # ── Step 5: OCR — try PSM modes best-suited for receipt layout ──────── + # PSM 6 = single uniform text block (best for single-column receipts) + # PSM 4 = single column, variable text sizes (wider fallback) + # PSM 11 = sparse text — last resort for badly segmented images + for psm in (6, 4, 11): + try: + text = pytesseract.image_to_string( + img, config=f'--oem 3 --psm {psm}').strip() + if len(text) >= 20: + logger.debug('Tesseract OCR %s: psm=%d %d chars', filename, psm, len(text)) + return text + except Exception: + pass - logger.debug('Tesseract OCR %s: %d chars', filename, len(text)) - return text + logger.warning('Tesseract OCR %s: all PSM modes returned < 20 chars', filename) + return '' except ImportError: logger.warning('pytesseract/Pillow not installed — OCR unavailable for %s', filename) return f'[Image: {filename} — install pytesseract+Pillow for OCR]' diff --git a/tests/test_expenses_agent.py b/tests/test_expenses_agent.py index 99eabc0..b353d7d 100644 --- a/tests/test_expenses_agent.py +++ b/tests/test_expenses_agent.py @@ -458,6 +458,27 @@ class TestExtractAmount: def test_comma_in_amount(self): assert _extract_amount_from_text('Grand Total: $1,234.56') == 1234.56 + def test_bottom_scan_garbled_total(self): + # OCR garbled "TOTAL" — bottom-scan fallback should find the amount + text = 'Burger 5.99\nFries 2.50\nT0TAL 8.49' + assert _extract_amount_from_text(text) == 8.49 + + def test_bottom_scan_skips_change(self): + # Should return the total (8.49), not the change (1.51) + text = 'TOTAL 8.49\nCash 10.00\nChange 1.51' + assert _extract_amount_from_text(text) == 8.49 + + def test_bottom_scan_amount_on_own_line(self): + # Amount printed on a separate line below the label + text = 'Items 5.00\nTax 0.50\nTotal\n5.50' + assert _extract_amount_from_text(text) == 5.50 + + def 
test_amount_due_with_usd_suffix(self): + # PDF text may include "USD" after the number — the amount must still be + # extracted: if the labeled-total regex does not match, the bottom scan finds it + text = 'Total Charged: $198.40 USD' + assert _extract_amount_from_text(text) == 198.40 + class TestExtractDate: def test_iso_format(self):