Improve OCR preprocessing and amount extraction robustness
Image preprocessing (receipt_parser.py):
- Add ImageOps.exif_transpose() — fixes portrait photos stored with EXIF rotation metadata (most phone photos); without this, Tesseract reads a rotated image and produces garbage
- Upscale images < 600px wide for better character recognition
- Raise binarization threshold 140→160 for faint thermal-print receipts
- Try PSM 6 (single text block) first, with PSM 4 and PSM 11 as fallbacks; PSM 6 is better suited to single-column receipt layout

Amount extraction (expenses_agent.py):
- Add Pass 2 bottom-of-receipt line scan for when the labeled Total: regex fails; reads lines bottom-to-top in the last 50% of text, skipping change/tip lines — handles the 'T0TAL' OCR misread and amount-on-next-line layout
- Add _SKIP_LINE_RE and _ANY_DOLLAR_RE module-level patterns
- 8 new tests covering garbled total, change-skip, USD suffix, etc.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -85,34 +85,58 @@ def _ocr_image(data: bytes, filename: str) -> str:
|
||||
|
||||
|
||||
def _ocr_image_tesseract(data: bytes, filename: str) -> str:
|
||||
"""Tesseract-based OCR pipeline (fallback)."""
|
||||
"""Tesseract-based OCR pipeline with phone-photo preprocessing."""
|
||||
try:
|
||||
from PIL import Image, ImageFilter, ImageOps
|
||||
import pytesseract
|
||||
img = Image.open(io.BytesIO(data))
|
||||
|
||||
# Resize very large images — tesseract is slower and less accurate at
|
||||
# phone-camera resolution; 1800px wide is plenty for receipt text.
|
||||
# ── Step 1: EXIF rotation correction ─────────────────────────────────
|
||||
# Phone photos are stored with EXIF orientation metadata but the pixel
|
||||
# data is not actually rotated. Without this fix Tesseract reads a
|
||||
# portrait receipt as a landscape image and produces garbage.
|
||||
try:
|
||||
img = ImageOps.exif_transpose(img)
|
||||
except Exception:
|
||||
pass # exif_transpose requires Pillow >= 6.0
|
||||
|
||||
# ── Step 2: Resize to working width (1800px) ──────────────────────────
|
||||
max_w = 1800
|
||||
if img.width > max_w:
|
||||
scale = max_w / img.width
|
||||
img = img.resize((max_w, int(img.height * scale)), Image.LANCZOS)
|
||||
# Upscale very small images — Tesseract accuracy drops below ~600px
|
||||
elif img.width < 600:
|
||||
scale = 600 / img.width
|
||||
img = img.resize((600, int(img.height * scale)), Image.LANCZOS)
|
||||
|
||||
# Grayscale + adaptive binarisation + sharpen
|
||||
# ── Step 3: Grayscale + contrast ─────────────────────────────────────
|
||||
img = ImageOps.grayscale(img)
|
||||
img = ImageOps.autocontrast(img)
|
||||
img = img.point(lambda x: 0 if x < 140 else 255)
|
||||
|
||||
# ── Step 4: Sharpen then binarize ─────────────────────────────────────
|
||||
# Sharpen first so edges are crisp before thresholding.
|
||||
# Threshold 160 (was 140) — gentler for faint thermal-print receipts
|
||||
# where light gray text would be wiped out by the stricter threshold.
|
||||
img = img.filter(ImageFilter.SHARPEN)
|
||||
img = img.point(lambda x: 0 if x < 160 else 255)
|
||||
|
||||
# psm 1 = automatic page segmentation + OSD (handles rotated receipts).
|
||||
# Fall back to psm 6 if OSD data is missing.
|
||||
try:
|
||||
text = pytesseract.image_to_string(img, config='--oem 3 --psm 1').strip()
|
||||
except Exception:
|
||||
text = pytesseract.image_to_string(img, config='--oem 3 --psm 6').strip()
|
||||
# ── Step 5: OCR — try PSM modes best-suited for receipt layout ────────
|
||||
# PSM 6 = single uniform text block (best for single-column receipts)
|
||||
# PSM 4 = single column, variable text sizes (wider fallback)
|
||||
# PSM 11 = sparse text — last resort for badly segmented images
|
||||
for psm in (6, 4, 11):
|
||||
try:
|
||||
text = pytesseract.image_to_string(
|
||||
img, config=f'--oem 3 --psm {psm}').strip()
|
||||
if len(text) >= 20:
|
||||
logger.debug('Tesseract OCR %s: psm=%d %d chars', filename, psm, len(text))
|
||||
return text
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
logger.debug('Tesseract OCR %s: %d chars', filename, len(text))
|
||||
return text
|
||||
logger.warning('Tesseract OCR %s: all PSM modes returned < 20 chars', filename)
|
||||
return ''
|
||||
except ImportError:
|
||||
logger.warning('pytesseract/Pillow not installed — OCR unavailable for %s', filename)
|
||||
return f'[Image: {filename} — install pytesseract+Pillow for OCR]'
|
||||
|
||||
Reference in New Issue
Block a user