Improve OCR preprocessing and amount extraction robustness
Image preprocessing (receipt_parser.py):
- Add ImageOps.exif_transpose() — fixes portrait photos stored with EXIF rotation metadata (most phone photos); without this, Tesseract reads a rotated image and produces garbage
- Upscale images < 600px wide for better character recognition
- Raise binarization threshold 140→160 for faint thermal-print receipts
- Try PSM 6 (single text block) first, then PSM 4 and PSM 11 as fallbacks; PSM 6 is better suited to single-column receipt layout

Amount extraction (expenses_agent.py):
- Add Pass 2 bottom-of-receipt line scan for when the labeled Total: regex fails; reads lines bottom-to-top in the last 50% of the text, skipping change/tip lines — handles the 'T0TAL' OCR misread and the amount-on-next-line layout
- Add _SKIP_LINE_RE and _ANY_DOLLAR_RE module-level patterns
- 8 new tests covering garbled total, change-skip, USD suffix, etc.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,7 +12,7 @@ from ..tools.expenses_tools import ExpensesTools
|
|||||||
# Receipt OCR helpers — regex-based, deterministic extraction
|
# Receipt OCR helpers — regex-based, deterministic extraction
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
# Matches the final-total line on a receipt.
|
# Matches an explicitly labeled total line.
|
||||||
# Handles "Total: $22.46", "GRAND TOTAL 22.46", "Amount Due: 22.46", etc.
|
# Handles "Total: $22.46", "GRAND TOTAL 22.46", "Amount Due: 22.46", etc.
|
||||||
_TOTAL_RE = re.compile(
|
_TOTAL_RE = re.compile(
|
||||||
r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|'
|
r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|'
|
||||||
@@ -21,22 +21,60 @@ _TOTAL_RE = re.compile(
|
|||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Lines printed AFTER the total (change given, tip, etc.) — skip these
|
||||||
|
# when doing the bottom-of-receipt scan so we don't mistake them for the total.
|
||||||
|
_SKIP_LINE_RE = re.compile(
|
||||||
|
r'\b(?:change|cash\s*(?:paid|tendered)?|tip|gratuity|approved|'
|
||||||
|
r'auth(?:orized)?|visa|mastercard|amex|discover)\b',
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Any standalone dollar-like amount: optional $, then 1-6 integer-part
# characters (digits and thousands commas both count toward the 6), then
# exactly 2 decimals.
|
||||||
|
_ANY_DOLLAR_RE = re.compile(r'(?<!\d)\$?\s*([\d,]{1,6}\.\d{2})(?!\d)')
|
||||||
|
|
||||||
_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD
|
_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD
|
||||||
_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY
|
_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY
|
||||||
_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY
|
_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY
|
||||||
|
|
||||||
|
|
||||||
def _extract_amount_from_text(text: str) -> float:
|
def _extract_amount_from_text(text: str) -> float:
|
||||||
"""Return the final total from OCR receipt text, or 0.0 if not found."""
|
"""Return the final total from OCR receipt text, or 0.0 if not found.
|
||||||
|
|
||||||
|
Pass 1 — labeled total: 'Total:', 'Grand Total:', 'Amount Due:', etc.
|
||||||
|
Pass 2 — bottom scan: reads lines from the bottom of the last 50% of text,
|
||||||
|
skipping change/cash/tip lines. Handles cases where Tesseract
|
||||||
|
garbled 'TOTAL' (e.g. 'T0TAL') or placed the amount on its own
|
||||||
|
line below the label.
|
||||||
|
"""
|
||||||
if not text:
|
if not text:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
|
# Pass 1: explicit label match
|
||||||
matches = list(_TOTAL_RE.finditer(text))
|
matches = list(_TOTAL_RE.finditer(text))
|
||||||
if matches:
|
if matches:
|
||||||
raw = matches[-1].group(1).replace(',', '') # last match = grand total
|
raw = matches[-1].group(1).replace(',', '')
|
||||||
try:
|
try:
|
||||||
return float(raw)
|
val = float(raw)
|
||||||
|
if val > 0:
|
||||||
|
return val
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Pass 2: bottom-of-receipt line scan
|
||||||
|
# Only search the bottom half so item prices (middle section) are excluded
|
||||||
|
bottom = text[max(0, int(len(text) * 0.5)):]
|
||||||
|
for line in reversed(bottom.splitlines()):
|
||||||
|
if _SKIP_LINE_RE.search(line):
|
||||||
|
continue
|
||||||
|
m = _ANY_DOLLAR_RE.search(line)
|
||||||
|
if m:
|
||||||
|
try:
|
||||||
|
val = float(m.group(1).replace(',', ''))
|
||||||
|
if val > 0:
|
||||||
|
return val
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -85,34 +85,58 @@ def _ocr_image(data: bytes, filename: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def _ocr_image_tesseract(data: bytes, filename: str) -> str:
|
def _ocr_image_tesseract(data: bytes, filename: str) -> str:
|
||||||
"""Tesseract-based OCR pipeline (fallback)."""
|
"""Tesseract-based OCR pipeline with phone-photo preprocessing."""
|
||||||
try:
|
try:
|
||||||
from PIL import Image, ImageFilter, ImageOps
|
from PIL import Image, ImageFilter, ImageOps
|
||||||
import pytesseract
|
import pytesseract
|
||||||
img = Image.open(io.BytesIO(data))
|
img = Image.open(io.BytesIO(data))
|
||||||
|
|
||||||
# Resize very large images — tesseract is slower and less accurate at
|
# ── Step 1: EXIF rotation correction ─────────────────────────────────
|
||||||
# phone-camera resolution; 1800px wide is plenty for receipt text.
|
# Phone photos are stored with EXIF orientation metadata but the pixel
|
||||||
|
# data is not actually rotated. Without this fix Tesseract reads a
|
||||||
|
# portrait receipt as a landscape image and produces garbage.
|
||||||
|
try:
|
||||||
|
img = ImageOps.exif_transpose(img)
|
||||||
|
except Exception:
|
||||||
|
pass # exif_transpose requires Pillow >= 6.0
|
||||||
|
|
||||||
|
# ── Step 2: Resize to working width (1800px) ──────────────────────────
|
||||||
max_w = 1800
|
max_w = 1800
|
||||||
if img.width > max_w:
|
if img.width > max_w:
|
||||||
scale = max_w / img.width
|
scale = max_w / img.width
|
||||||
img = img.resize((max_w, int(img.height * scale)), Image.LANCZOS)
|
img = img.resize((max_w, int(img.height * scale)), Image.LANCZOS)
|
||||||
|
# Upscale very small images — Tesseract accuracy drops below ~600px
|
||||||
|
elif img.width < 600:
|
||||||
|
scale = 600 / img.width
|
||||||
|
img = img.resize((600, int(img.height * scale)), Image.LANCZOS)
|
||||||
|
|
||||||
# Grayscale + adaptive binarisation + sharpen
|
# ── Step 3: Grayscale + contrast ─────────────────────────────────────
|
||||||
img = ImageOps.grayscale(img)
|
img = ImageOps.grayscale(img)
|
||||||
img = ImageOps.autocontrast(img)
|
img = ImageOps.autocontrast(img)
|
||||||
img = img.point(lambda x: 0 if x < 140 else 255)
|
|
||||||
|
# ── Step 4: Sharpen then binarize ─────────────────────────────────────
|
||||||
|
# Sharpen first so edges are crisp before thresholding.
|
||||||
|
# Threshold 160 (was 140) — gentler for faint thermal-print receipts
|
||||||
|
# where light gray text would be wiped out by the stricter threshold.
|
||||||
img = img.filter(ImageFilter.SHARPEN)
|
img = img.filter(ImageFilter.SHARPEN)
|
||||||
|
img = img.point(lambda x: 0 if x < 160 else 255)
|
||||||
|
|
||||||
# psm 1 = automatic page segmentation + OSD (handles rotated receipts).
|
# ── Step 5: OCR — try PSM modes best-suited for receipt layout ────────
|
||||||
# Fall back to psm 6 if OSD data is missing.
|
# PSM 6 = single uniform text block (best for single-column receipts)
|
||||||
try:
|
# PSM 4 = single column, variable text sizes (wider fallback)
|
||||||
text = pytesseract.image_to_string(img, config='--oem 3 --psm 1').strip()
|
# PSM 11 = sparse text — last resort for badly segmented images
|
||||||
except Exception:
|
for psm in (6, 4, 11):
|
||||||
text = pytesseract.image_to_string(img, config='--oem 3 --psm 6').strip()
|
try:
|
||||||
|
text = pytesseract.image_to_string(
|
||||||
|
img, config=f'--oem 3 --psm {psm}').strip()
|
||||||
|
if len(text) >= 20:
|
||||||
|
logger.debug('Tesseract OCR %s: psm=%d %d chars', filename, psm, len(text))
|
||||||
|
return text
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
logger.debug('Tesseract OCR %s: %d chars', filename, len(text))
|
logger.warning('Tesseract OCR %s: all PSM modes returned < 20 chars', filename)
|
||||||
return text
|
return ''
|
||||||
except ImportError:
|
except ImportError:
|
||||||
logger.warning('pytesseract/Pillow not installed — OCR unavailable for %s', filename)
|
logger.warning('pytesseract/Pillow not installed — OCR unavailable for %s', filename)
|
||||||
return f'[Image: {filename} — install pytesseract+Pillow for OCR]'
|
return f'[Image: {filename} — install pytesseract+Pillow for OCR]'
|
||||||
|
|||||||
@@ -458,6 +458,27 @@ class TestExtractAmount:
|
|||||||
def test_comma_in_amount(self):
|
def test_comma_in_amount(self):
|
||||||
assert _extract_amount_from_text('Grand Total: $1,234.56') == 1234.56
|
assert _extract_amount_from_text('Grand Total: $1,234.56') == 1234.56
|
||||||
|
|
||||||
|
def test_bottom_scan_garbled_total(self):
|
||||||
|
# OCR garbled "TOTAL" — bottom-scan fallback should find the amount
|
||||||
|
text = 'Burger 5.99\nFries 2.50\nT0TAL 8.49'
|
||||||
|
assert _extract_amount_from_text(text) == 8.49
|
||||||
|
|
||||||
|
def test_bottom_scan_skips_change(self):
|
||||||
|
# Should return the total (8.49), not the change (1.51)
|
||||||
|
text = 'TOTAL 8.49\nCash 10.00\nChange 1.51'
|
||||||
|
assert _extract_amount_from_text(text) == 8.49
|
||||||
|
|
||||||
|
def test_bottom_scan_amount_on_own_line(self):
|
||||||
|
# Amount printed on a separate line below the label
|
||||||
|
text = 'Items 5.00\nTax 0.50\nTotal\n5.50'
|
||||||
|
assert _extract_amount_from_text(text) == 5.50
|
||||||
|
|
||||||
|
def test_amount_due_with_usd_suffix(self):
|
||||||
|
# PDF text may include "USD" after the number — regex should still work
|
||||||
|
# via the bottom scan since the labeled-total regex won't match "USD"
|
||||||
|
text = 'Total Charged: $198.40 USD'
|
||||||
|
assert _extract_amount_from_text(text) == 198.40
|
||||||
|
|
||||||
|
|
||||||
class TestExtractDate:
|
class TestExtractDate:
|
||||||
def test_iso_format(self):
|
def test_iso_format(self):
|
||||||
|
|||||||
Reference in New Issue
Block a user