Add EasyOCR engine for receipt image parsing

EasyOCR (deep-learning OCR) replaces Tesseract as the default engine for
receipt images. It handles phone photos, thermal paper, dot-matrix fonts,
and rotated images significantly better than Tesseract without requiring
manual preprocessing pipelines.

Key design decisions:
- OCR_ENGINE=easyocr (default) | tesseract — switchable via .env, no rebuild
- EasyOCR Reader is a module-level singleton: model loaded lazily on first
  use and reused for the life of the process, not reloaded per receipt
- Falls back to Tesseract automatically if EasyOCR fails or returns < 20 chars
- EXIF rotation fix still applied before EasyOCR (phone photo orientation)
- Images resized to max 2000px width for speed before passing to EasyOCR
- _easyocr_to_text() groups detections into visual lines (y-overlap) and
  sorts left-to-right within each line for clean single-string output

Revert: echo "OCR_ENGINE=tesseract" >> .env && docker compose up -d agent-service

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-21 01:22:22 -04:00
parent a736f3352b
commit 69519393c1
2 changed files with 131 additions and 1 deletions

View File

@@ -3,6 +3,7 @@ import base64
import hashlib import hashlib
import io import io
import logging import logging
import os
import re import re
import zipfile import zipfile
from pathlib import Path from pathlib import Path
@@ -12,6 +13,31 @@ logger = logging.getLogger(__name__)
# Extract YYYYMMDD from filenames like 20260509_180857.jpg # Extract YYYYMMDD from filenames like 20260509_180857.jpg
_DATE_PATTERN = re.compile(r'(\d{4})(\d{2})(\d{2})_\d{6}') _DATE_PATTERN = re.compile(r'(\d{4})(\d{2})(\d{2})_\d{6}')
# ---------------------------------------------------------------------------
# OCR engine selection
# ---------------------------------------------------------------------------
# Set OCR_ENGINE=tesseract in .env to revert to the old Tesseract pipeline.
# Default is easyocr which handles phone photos and difficult fonts better.
def _get_ocr_engine() -> str:
    """Return the configured OCR engine name, normalised to lowercase.

    Reads the ``OCR_ENGINE`` environment variable on every call. Surrounding
    whitespace (for example a stray space or carriage return left by a
    hand-edited ``.env`` file) is stripped, and an unset or blank value
    yields the default ``'easyocr'``.

    Returns:
        The engine name in lowercase. The value is not validated here.
    """
    configured = os.environ.get('OCR_ENGINE', '').strip().lower()
    # A blank value means "not configured", so it gets the documented default
    # rather than silently selecting a different engine.
    return configured or 'easyocr'
# EasyOCR Reader is expensive to initialise (~10-30s on first call while it
# loads model weights). Cache it as a module-level singleton so the cost is
# paid once per process (lazily, on first use), not once per receipt.
_easyocr_reader = None


def _get_easyocr_reader():
    """Return the shared EasyOCR reader, creating it on the first call.

    ``easyocr`` is imported inside the function, so an ImportError reaches
    the caller only when a reader is first requested. The reader is built for
    English (``['en']``) with ``verbose=False`` and stored in the module-level
    ``_easyocr_reader`` so later calls reuse it.
    """
    global _easyocr_reader
    # Fast path: a reader was already built by an earlier call.
    if _easyocr_reader is not None:
        return _easyocr_reader

    import easyocr

    logger.info('EasyOCR: initialising reader (first use — loading model weights)')
    _easyocr_reader = easyocr.Reader(['en'], verbose=False)
    logger.info('EasyOCR: reader ready')
    return _easyocr_reader
_MIME = { _MIME = {
'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
'.png': 'image/png', '.gif': 'image/gif', '.png': 'image/png', '.gif': 'image/gif',
@@ -80,10 +106,110 @@ def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:
def _ocr_image(data: bytes, filename: str) -> str:
    """Run OCR on image bytes with whichever engine is configured.

    Routes to the EasyOCR pipeline when ``_get_ocr_engine()`` returns
    ``'easyocr'``; every other value is handled by the Tesseract pipeline.
    """
    use_easyocr = _get_ocr_engine() == 'easyocr'
    pipeline = _ocr_image_easyocr if use_easyocr else _ocr_image_tesseract
    return pipeline(data, filename)
def _easyocr_to_text(
    results: list,
    *,
    min_confidence: float = 0.3,
    line_overlap: float = 0.3,
) -> str:
    """Convert an EasyOCR result list to a single text string.

    Each result is a ``(bbox, text, confidence)`` tuple where ``bbox`` is a
    sequence of ``(x, y)`` points. Detections are filtered, ordered
    top-to-bottom, grouped into visual lines, and each line is ordered
    left-to-right. Words on a line are joined with a space and lines with a
    newline, so a label such as "TOTAL 42.90" stays on one line.

    Grouping rule: the current line's band spans from the smallest ``y_top``
    to the largest ``y_bot`` of the boxes already in it. A box joins the line
    when its top edge lies above ``band_bottom - band_height * line_overlap``;
    otherwise it starts a new line. The band height is floored at 1 so
    degenerate (zero-height) boxes still form a band.

    Args:
        results: EasyOCR detections as ``(bbox, text, confidence)`` tuples.
        min_confidence: Detections with a lower confidence are dropped.
        line_overlap: Fraction of the band height, measured up from the band's
            bottom edge, that a box's top must clear to join the line.

    Returns:
        The reconstructed text, or ``''`` when nothing survives filtering.
    """
    if not results:
        return ''

    # Keep only confident, non-blank detections and reduce each quadrilateral
    # to the three coordinates the grouping needs.
    boxes = []
    for bbox, text, conf in results:
        cleaned = text.strip()
        if conf < min_confidence or not cleaned:
            continue
        ys = [pt[1] for pt in bbox]
        boxes.append({
            'y_top': min(ys),
            'y_bot': max(ys),
            'x_left': min(pt[0] for pt in bbox),
            'text': cleaned,
        })
    if not boxes:
        return ''

    boxes.sort(key=lambda b: (b['y_top'], b['x_left']))

    # Group into visual lines. Because boxes are sorted by y_top, the band's
    # top is always the first box's top; only the bottom needs updating as
    # boxes join, which avoids rescanning the whole group for every box.
    lines = []
    current = [boxes[0]]
    band_top = boxes[0]['y_top']
    band_bot = boxes[0]['y_bot']
    for box in boxes[1:]:
        height = max(band_bot - band_top, 1)
        if box['y_top'] < band_bot - height * line_overlap:
            current.append(box)
            band_bot = max(band_bot, box['y_bot'])
        else:
            lines.append(current)
            current = [box]
            band_top, band_bot = box['y_top'], box['y_bot']
    lines.append(current)

    return '\n'.join(
        ' '.join(b['text'] for b in sorted(line, key=lambda b: b['x_left']))
        for line in lines
    )
def _ocr_image_easyocr(data: bytes, filename: str) -> str:
    """Extract text from a receipt image with EasyOCR, falling back to Tesseract.

    Steps: decode the bytes with Pillow, apply the EXIF orientation, normalise
    the pixel mode to RGB or greyscale, downscale images wider than 2000px,
    run the shared EasyOCR reader, and flatten the detections with
    ``_easyocr_to_text``.

    Args:
        data: Raw bytes of the image file.
        filename: Name of the image; used in log messages and passed through
            to the Tesseract fallback.

    Returns:
        The EasyOCR text when it is at least 20 characters long. For shorter
        results Tesseract is also run and the longer of the two texts is
        returned. If a dependency is missing or any step raises, the result
        of ``_ocr_image_tesseract`` is returned instead.
    """
    try:
        import numpy as np
        from PIL import Image, ImageOps

        reader = _get_easyocr_reader()
        img = Image.open(io.BytesIO(data))

        # EXIF rotation — same fix as the Tesseract pipeline. Best-effort: a
        # corrupt EXIF block should not prevent OCR of the un-rotated image.
        try:
            img = ImageOps.exif_transpose(img)
        except Exception as exc:
            logger.debug('EasyOCR %s: EXIF transpose skipped: %s', filename, exc)

        # EasyOCR interprets a numpy input from its shape alone (2-D as
        # greyscale, 3/4 channels as colour). Palette ('P'), bilevel ('1'),
        # CMYK or LA images would therefore be misread or rejected, so
        # normalise everything except RGB and greyscale to RGB. Doing this
        # before the resize also lets palette images use LANCZOS filtering.
        if img.mode not in ('RGB', 'L'):
            img = img.convert('RGB')

        # Resize very large images for speed; EasyOCR is accurate but slow on
        # images wider than ~2000px (typical 12MP phone photo is ~4000px wide).
        max_w = 2000
        if img.width > max_w:
            scale = max_w / img.width
            # Clamp so an extremely wide, short image never gets height 0.
            new_h = max(1, int(img.height * scale))
            img = img.resize((max_w, new_h), Image.LANCZOS)

        # EasyOCR accepts a numpy array directly
        img_array = np.array(img)
        results = reader.readtext(img_array)
        text = _easyocr_to_text(results)

        logger.debug('EasyOCR %s: %d chars', filename, len(text))
        if len(text) >= 20:
            return text

        # Very short result — try Tesseract as fallback before giving up
        logger.warning('EasyOCR %s: only %d chars, trying Tesseract fallback',
                       filename, len(text))
        tess = _ocr_image_tesseract(data, filename)
        return tess if len(tess) > len(text) else text

    except ImportError:
        logger.warning('easyocr/numpy not installed — falling back to Tesseract for %s', filename)
        return _ocr_image_tesseract(data, filename)
    except Exception as exc:
        logger.warning('EasyOCR failed for %s: %s — falling back to Tesseract', filename, exc)
        return _ocr_image_tesseract(data, filename)
def _ocr_image_tesseract(data: bytes, filename: str) -> str: def _ocr_image_tesseract(data: bytes, filename: str) -> str:
"""Tesseract-based OCR pipeline with phone-photo preprocessing.""" """Tesseract-based OCR pipeline with phone-photo preprocessing."""
try: try:

View File

@@ -15,5 +15,9 @@ ollama==0.3.3
pdfplumber==0.11.4 pdfplumber==0.11.4
Pillow==10.4.0 Pillow==10.4.0
pytesseract==0.3.13 pytesseract==0.3.13
# EasyOCR: deep-learning OCR, better on phone photos and difficult fonts.
# Set OCR_ENGINE=tesseract in .env to use Tesseract instead.
# Note: pulls in torch (~1.5GB) — only add if disk space allows.
easyocr
python-multipart==0.0.12 python-multipart==0.0.12
docker==7.1.0 docker==7.1.0