From 69519393c10673bf9b395d8337e6b31d601d0e69 Mon Sep 17 00:00:00 2001 From: Carlos Garcia Date: Thu, 21 May 2026 01:22:22 -0400 Subject: [PATCH] Add EasyOCR engine for receipt image parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit EasyOCR (deep-learning OCR) replaces Tesseract as the default engine for receipt images. It handles phone photos, thermal paper, dot-matrix fonts, and rotated images significantly better than Tesseract without requiring manual preprocessing pipelines. Key design decisions: - OCR_ENGINE=easyocr (default) | tesseract — switchable via .env, no rebuild - EasyOCR Reader is a module-level singleton: model loaded once per container start, not per receipt - Falls back to Tesseract automatically if EasyOCR fails or returns < 20 chars - EXIF rotation fix still applied before EasyOCR (phone photo orientation) - Images resized to max 2000px width for speed before passing to EasyOCR - _easyocr_to_text() groups detections into visual lines (y-overlap) and sorts left-to-right within each line for clean single-string output Revert: echo "OCR_ENGINE=tesseract" >> .env && docker compose up -d agent-service Co-Authored-By: Claude Sonnet 4.6 --- agent_service/tools/receipt_parser.py | 128 +++++++++++++++++++++++++- requirements.txt | 4 + 2 files changed, 131 insertions(+), 1 deletion(-) diff --git a/agent_service/tools/receipt_parser.py b/agent_service/tools/receipt_parser.py index 1e26e50..7b14831 100644 --- a/agent_service/tools/receipt_parser.py +++ b/agent_service/tools/receipt_parser.py @@ -3,6 +3,7 @@ import base64 import hashlib import io import logging +import os import re import zipfile from pathlib import Path @@ -12,6 +13,31 @@ logger = logging.getLogger(__name__) # Extract YYYYMMDD from filenames like 20260509_180857.jpg _DATE_PATTERN = re.compile(r'(\d{4})(\d{2})(\d{2})_\d{6}') +# --------------------------------------------------------------------------- +# OCR engine selection +# 
# ---------------------------------------------------------------------------
# Set OCR_ENGINE=tesseract in .env to revert to the old Tesseract pipeline.
# Default is easyocr which handles phone photos and difficult fonts better.

_DEFAULT_OCR_ENGINE = 'easyocr'

# Detections with a confidence below this are dropped before line grouping.
_EASYOCR_MIN_CONFIDENCE = 0.3
# A detection joins the current visual line only when its top edge lies above
# the bottom 30% of that line's vertical band.
_EASYOCR_LINE_OVERLAP = 0.3
# EasyOCR output shorter than this many characters triggers a Tesseract retry.
_EASYOCR_MIN_CHARS = 20
# Images wider than this (pixels) are downscaled before OCR for speed.
_EASYOCR_MAX_WIDTH = 2000


def _get_ocr_engine() -> str:
    """Return the configured OCR engine name, lower-cased.

    ``OCR_ENGINE`` is read from the environment on every call. Surrounding
    whitespace is ignored, and an unset or blank value yields the default
    (``'easyocr'``) so that a stray ``OCR_ENGINE=`` line in ``.env`` does not
    silently switch engines.
    """
    configured = os.environ.get('OCR_ENGINE', '').strip().lower()
    return configured or _DEFAULT_OCR_ENGINE


# EasyOCR Reader is expensive to initialise (~10-30s on first call while it
# loads model weights).  Cache it as a module-level singleton so the cost is
# paid once per container start, not once per receipt.
_easyocr_reader = None


def _get_easyocr_reader():
    """Return the shared ``easyocr.Reader``, creating it on first use.

    ``easyocr`` is imported lazily so the module still imports when the
    package is absent; in that case this function raises ``ImportError``.

    NOTE(review): creation is not guarded by a lock, so concurrent first
    calls could each build a Reader — confirm how callers are threaded.
    """
    global _easyocr_reader
    if _easyocr_reader is None:
        import easyocr
        logger.info('EasyOCR: initialising reader (first use — loading model weights)')
        _easyocr_reader = easyocr.Reader(['en'], verbose=False)
        logger.info('EasyOCR: reader ready')
    return _easyocr_reader


# (Unchanged module code sits here in receipt_parser.py: the _MIME table and
# _extract_zip(), which this excerpt shows only partially.)


def _ocr_image(data: bytes, filename: str) -> str:
    """Dispatch to the configured OCR engine (EasyOCR or Tesseract).

    Any engine name other than ``'easyocr'`` selects the Tesseract pipeline.
    """
    if _get_ocr_engine() == 'easyocr':
        return _ocr_image_easyocr(data, filename)
    return _ocr_image_tesseract(data, filename)


def _easyocr_to_text(results: list, *,
                     min_confidence: float = _EASYOCR_MIN_CONFIDENCE,
                     line_overlap: float = _EASYOCR_LINE_OVERLAP) -> str:
    """Convert an EasyOCR result list to a single text string.

    ``results`` is a list of ``(bbox, text, confidence)`` tuples, where
    ``bbox`` is a sequence of ``(x, y)`` corner points.  Detections with a
    confidence below ``min_confidence`` or with blank text are discarded.
    The rest are sorted top-to-bottom, grouped into visual lines, ordered
    left-to-right within each line, and joined: words with a space, lines
    with a newline.

    A detection joins the current line when its top edge is above
    ``band_bottom - line_overlap * band_height``, where the band is the
    vertical span of the boxes already on that line (height floored at 1).
    This keeps a label such as "TOTAL 42.90" on one line even when the two
    boxes are slightly offset vertically.

    Returns ``''`` when ``results`` is empty or nothing survives filtering.
    """
    if not results:
        return ''

    # Filter and extract the geometry needed for ordering.
    boxes = []
    for bbox, text, conf in results:
        cleaned = text.strip()
        if conf < min_confidence or not cleaned:
            continue
        ys = [pt[1] for pt in bbox]
        boxes.append({
            'y_top': min(ys), 'y_bot': max(ys),
            'x_left': min(pt[0] for pt in bbox), 'text': cleaned,
        })
    if not boxes:
        return ''
    boxes.sort(key=lambda b: (b['y_top'], b['x_left']))

    # Group into visual lines.  Boxes arrive sorted by y_top, so the band's
    # top is fixed by the first box of the line and only its bottom can grow;
    # tracking both incrementally avoids rescanning the line for every box.
    lines: list[list[dict]] = []
    current: list[dict] = [boxes[0]]
    band_top, band_bot = boxes[0]['y_top'], boxes[0]['y_bot']
    for box in boxes[1:]:
        height = max(band_bot - band_top, 1)
        if box['y_top'] < band_bot - height * line_overlap:
            current.append(box)
            band_bot = max(band_bot, box['y_bot'])
        else:
            lines.append(current)
            current = [box]
            band_top, band_bot = box['y_top'], box['y_bot']
    lines.append(current)

    return '\n'.join(
        ' '.join(b['text'] for b in sorted(line, key=lambda b: b['x_left']))
        for line in lines
    )


def _load_image_for_easyocr(data: bytes, filename: str):
    """Decode *data* into an upright, size-capped numpy array for EasyOCR."""
    import numpy as np
    from PIL import Image, ImageOps

    img = Image.open(io.BytesIO(data))

    # EXIF rotation — same fix as the Tesseract pipeline.  Best effort: a
    # broken EXIF block must not stop OCR, but record that it was skipped.
    try:
        img = ImageOps.exif_transpose(img)
    except Exception as exc:
        logger.debug('EasyOCR %s: EXIF transpose skipped: %s', filename, exc)

    # Palette ('P'), bilevel ('1'), CMYK and alpha modes do not turn into a
    # pixel array EasyOCR interprets correctly (a palette image becomes an
    # array of colour *indices*), so normalise everything except plain
    # greyscale to RGB.  Done before resizing so LANCZOS sees real pixels.
    if img.mode not in ('RGB', 'L'):
        img = img.convert('RGB')

    # Resize very large images for speed; EasyOCR is accurate but slow on
    # images wider than ~2000px (typical 12MP phone photo is ~4000px wide).
    if img.width > _EASYOCR_MAX_WIDTH:
        scale = _EASYOCR_MAX_WIDTH / img.width
        new_height = max(1, int(img.height * scale))
        img = img.resize((_EASYOCR_MAX_WIDTH, new_height), Image.LANCZOS)

    # EasyOCR accepts a numpy array directly.
    return np.array(img)


def _ocr_image_easyocr(data: bytes, filename: str) -> str:
    """EasyOCR pipeline — better than Tesseract on phone photos, thermal paper,
    dot-matrix, and rotated receipts.

    Falls back to Tesseract when EasyOCR (or numpy/Pillow) cannot be imported,
    when EasyOCR raises, or when it yields fewer than ``_EASYOCR_MIN_CHARS``
    characters; in the last case the longer of the two results is returned.
    """
    try:
        img_array = _load_image_for_easyocr(data, filename)
        reader = _get_easyocr_reader()
        text = _easyocr_to_text(reader.readtext(img_array))
    except ImportError:
        logger.warning('easyocr/numpy not installed — falling back to Tesseract for %s', filename)
        return _ocr_image_tesseract(data, filename)
    except Exception as exc:
        logger.warning('EasyOCR failed for %s: %s — falling back to Tesseract', filename, exc)
        return _ocr_image_tesseract(data, filename)

    logger.debug('EasyOCR %s: %d chars', filename, len(text))
    if len(text) >= _EASYOCR_MIN_CHARS:
        return text

    # Very short result — try Tesseract as fallback before giving up.  Kept
    # outside the try block so a Tesseract failure is not retried a second
    # time by the generic EasyOCR error handler.
    logger.warning('EasyOCR %s: only %d chars, trying Tesseract fallback',
                   filename, len(text))
    tess = _ocr_image_tesseract(data, filename)
    return tess if len(tess) > len(text) else text


# NOTE: _ocr_image_tesseract(data, filename) ("Tesseract-based OCR pipeline
# with phone-photo preprocessing") is defined next in receipt_parser.py; only
# its opening lines are visible in this excerpt, so it is left unchanged.
#
# requirements.txt (same patch) adds, after pytesseract==0.3.13:
#   # EasyOCR: deep-learning OCR, better on phone photos and difficult fonts.
#   # Set OCR_ENGINE=tesseract in .env to use Tesseract instead.
#   # Note: pulls in torch (~1.5GB) — only add if disk space allows.
#   easyocr
# NOTE(review): every other requirement in that file is pinned to an exact
# version; consider pinning easyocr as well for reproducible builds.