Add EasyOCR engine for receipt image parsing
EasyOCR (deep-learning OCR) replaces Tesseract as the default engine for receipt images. It handles phone photos, thermal paper, dot-matrix fonts, and rotated images significantly better than Tesseract without requiring manual preprocessing pipelines.

Key design decisions:
- OCR_ENGINE=easyocr (default) | tesseract — switchable via .env, no rebuild
- EasyOCR Reader is a module-level singleton: model loaded once per container start, not per receipt
- Falls back to Tesseract automatically if EasyOCR fails or returns < 20 chars
- EXIF rotation fix still applied before EasyOCR (phone photo orientation)
- Images resized to max 2000px width for speed before passing to EasyOCR
- _easyocr_to_text() groups detections into visual lines (y-overlap) and sorts left-to-right within each line for clean single-string output

Revert: echo "OCR_ENGINE=tesseract" >> .env && docker compose up -d agent-service

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,6 +3,7 @@ import base64
|
||||
import hashlib
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
@@ -12,6 +13,31 @@ logger = logging.getLogger(__name__)
|
||||
# Extract YYYYMMDD from filenames like 20260509_180857.jpg
|
||||
_DATE_PATTERN = re.compile(r'(\d{4})(\d{2})(\d{2})_\d{6}')
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OCR engine selection
|
||||
# ---------------------------------------------------------------------------
|
||||
# Set OCR_ENGINE=tesseract in .env to revert to the old Tesseract pipeline.
|
||||
# Default is easyocr which handles phone photos and difficult fonts better.
|
||||
|
||||
def _get_ocr_engine() -> str:
|
||||
return os.environ.get('OCR_ENGINE', 'easyocr').lower()
|
||||
|
||||
|
||||
# EasyOCR Reader is expensive to initialise (~10-30s on first call while it
|
||||
# loads model weights). Cache it as a module-level singleton so the cost is
|
||||
# paid once per container start, not once per receipt.
|
||||
_easyocr_reader = None
|
||||
|
||||
|
||||
def _get_easyocr_reader():
    """Return the shared EasyOCR reader, building it on first use.

    The reader is stored in the module-level ``_easyocr_reader`` variable;
    once that is set, later calls return the same object without touching
    EasyOCR again.

    Returns:
        The ``easyocr.Reader`` instance configured for English (``['en']``).

    Raises:
        ImportError: if the ``easyocr`` package cannot be imported when the
            reader has to be built.
    """
    global _easyocr_reader

    # Fast path: the reader has already been built.
    if _easyocr_reader is not None:
        return _easyocr_reader

    # Imported here rather than at module top, so the import only runs when
    # a reader actually has to be constructed.
    import easyocr

    logger.info('EasyOCR: initialising reader (first use — loading model weights)')
    _easyocr_reader = easyocr.Reader(['en'], verbose=False)
    logger.info('EasyOCR: reader ready')
    return _easyocr_reader
|
||||
|
||||
_MIME = {
|
||||
'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
|
||||
'.png': 'image/png', '.gif': 'image/gif',
|
||||
@@ -80,10 +106,110 @@ def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:
|
||||
|
||||
|
||||
def _ocr_image(data: bytes, filename: str) -> str:
    """Run OCR on image bytes with whichever engine is configured.

    Args:
        data: Raw bytes of the image file.
        filename: Name passed through to the engine-specific function.

    Returns:
        The text produced by the selected engine: the EasyOCR pipeline when
        ``_get_ocr_engine()`` returns ``'easyocr'``, otherwise the Tesseract
        pipeline.
    """
    engine = _get_ocr_engine()
    # Any value other than 'easyocr' selects the Tesseract pipeline.
    if engine != 'easyocr':
        return _ocr_image_tesseract(data, filename)
    return _ocr_image_easyocr(data, filename)
|
||||
|
||||
|
||||
def _easyocr_to_text(results: list) -> str:
|
||||
"""Convert EasyOCR result list to a single text string.
|
||||
|
||||
EasyOCR returns a list of (bbox, text, confidence) tuples. We filter
|
||||
low-confidence detections, sort top-to-bottom then left-to-right, and
|
||||
join with newlines. Receipt images are typically single-column so a
|
||||
simple y-sort produces a clean reading order.
|
||||
|
||||
Adjacent words on the same horizontal band (y within 40% of the tallest
|
||||
box's height in that group) are merged onto one line — this keeps a
|
||||
label like "TOTAL 42.90" on a single line instead of two lines,
|
||||
which is important for the labeled-total regex in expenses_agent.py.
|
||||
"""
|
||||
if not results:
|
||||
return ''
|
||||
# Filter and extract geometry
|
||||
boxes = []
|
||||
for bbox, text, conf in results:
|
||||
if conf < 0.3 or not text.strip():
|
||||
continue
|
||||
ys = [pt[1] for pt in bbox]
|
||||
xs = [pt[0] for pt in bbox]
|
||||
boxes.append({
|
||||
'y_top': min(ys), 'y_bot': max(ys),
|
||||
'x_left': min(xs), 'text': text.strip(),
|
||||
})
|
||||
if not boxes:
|
||||
return ''
|
||||
boxes.sort(key=lambda b: (b['y_top'], b['x_left']))
|
||||
|
||||
# Group into visual lines
|
||||
lines: list[list[dict]] = []
|
||||
current: list[dict] = [boxes[0]]
|
||||
for box in boxes[1:]:
|
||||
# Compute the current line's y-span
|
||||
cy_top = min(b['y_top'] for b in current)
|
||||
cy_bot = max(b['y_bot'] for b in current)
|
||||
height = max(cy_bot - cy_top, 1)
|
||||
# This box belongs to the same line if its top overlaps the current band
|
||||
if box['y_top'] < cy_bot - height * 0.3:
|
||||
current.append(box)
|
||||
else:
|
||||
lines.append(sorted(current, key=lambda b: b['x_left']))
|
||||
current = [box]
|
||||
lines.append(sorted(current, key=lambda b: b['x_left']))
|
||||
|
||||
return '\n'.join(' '.join(b['text'] for b in line) for line in lines)
|
||||
|
||||
|
||||
def _ocr_image_easyocr(data: bytes, filename: str) -> str:
    """Run EasyOCR on raw image bytes, falling back to Tesseract when needed.

    Pipeline: decode the image, apply EXIF orientation, convert to RGB,
    downscale to at most 2000px wide, run the shared EasyOCR reader, and
    flatten the detections with ``_easyocr_to_text``.

    Tesseract (``_ocr_image_tesseract``) is used instead when:
      * ``easyocr``/``numpy``/``PIL`` cannot be imported,
      * any other exception is raised while running the EasyOCR pipeline, or
      * EasyOCR yields fewer than 20 characters — in that case Tesseract is
        also run and the longer of the two texts is returned.

    Args:
        data: Raw bytes of the image file.
        filename: Used for log messages and passed to the Tesseract fallback.

    Returns:
        The extracted text (possibly empty).
    """
    try:
        import numpy as np
        from PIL import Image, ImageOps

        reader = _get_easyocr_reader()

        img = Image.open(io.BytesIO(data))

        # EXIF rotation — same fix as the Tesseract pipeline. Best effort:
        # a broken EXIF block should not prevent OCR on the pixels as stored.
        try:
            img = ImageOps.exif_transpose(img)
        except Exception as exc:
            logger.debug('EasyOCR %s: EXIF transpose skipped: %s', filename, exc)

        # Normalise to 3-channel RGB before building the array. Palette
        # ('P'), 1-bit, LA and CMYK images otherwise become arrays of palette
        # indices, booleans or non-RGB channels rather than pixel colours.
        # Doing this before the resize also lets LANCZOS apply to palette and
        # 1-bit images.
        if img.mode != 'RGB':
            img = img.convert('RGB')

        # Resize very large images for speed; EasyOCR is accurate but slow on
        # images wider than ~2000px (typical 12MP phone photo is ~4000px wide).
        max_w = 2000
        if img.width > max_w:
            scale = max_w / img.width
            # Clamp so an extremely wide, thin image never gets height 0.
            new_h = max(1, int(img.height * scale))
            img = img.resize((max_w, new_h), Image.LANCZOS)

        # EasyOCR accepts a numpy array directly
        results = reader.readtext(np.array(img))

        text = _easyocr_to_text(results)
        logger.debug('EasyOCR %s: %d chars', filename, len(text))

    except ImportError:
        logger.warning('easyocr/numpy not installed — falling back to Tesseract for %s', filename)
        return _ocr_image_tesseract(data, filename)
    except Exception as exc:
        logger.warning('EasyOCR failed for %s: %s — falling back to Tesseract', filename, exc)
        return _ocr_image_tesseract(data, filename)

    if len(text) >= 20:
        return text

    # Very short result — try Tesseract as fallback before giving up. This
    # runs outside the try block so a Tesseract error is not reported as an
    # EasyOCR failure and does not trigger a second Tesseract run.
    logger.warning('EasyOCR %s: only %d chars, trying Tesseract fallback',
                   filename, len(text))
    tess = _ocr_image_tesseract(data, filename)
    return tess if len(tess) > len(text) else text
|
||||
|
||||
|
||||
def _ocr_image_tesseract(data: bytes, filename: str) -> str:
|
||||
"""Tesseract-based OCR pipeline with phone-photo preprocessing."""
|
||||
try:
|
||||
|
||||
@@ -15,5 +15,9 @@ ollama==0.3.3
|
||||
pdfplumber==0.11.4
|
||||
Pillow==10.4.0
|
||||
pytesseract==0.3.13
|
||||
# EasyOCR: deep-learning OCR, better on phone photos and difficult fonts.
|
||||
# Set OCR_ENGINE=tesseract in .env to use Tesseract instead.
|
||||
# Note: pulls in torch (~1.5GB) — only add if disk space allows.
|
||||
easyocr
|
||||
python-multipart==0.0.12
|
||||
docker==7.1.0
|
||||
|
||||
Reference in New Issue
Block a user