fix: improve OCR accuracy for rotated/sideways receipt photos
- Dockerfile: add tesseract-ocr-osd for orientation detection data
- receipt_parser: downscale photos wider than 1800px to 1800px wide, convert to grayscale, and sharpen before OCR; use psm 1 (auto + OSD) so rotated receipts are correctly oriented before text extraction, falling back to psm 6 if that fails
- expenses_agent: tighten amount extraction prompt to pick the FINAL total, not subtotal or tax line, reducing misreads like 42.90->409.00

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,7 @@ WORKDIR /app
|
|||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
gcc libpq-dev \
|
gcc libpq-dev \
|
||||||
tesseract-ocr \
|
tesseract-ocr \
|
||||||
|
tesseract-ocr-osd \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
|
|||||||
@@ -220,11 +220,14 @@ class ExpensesAgent(BaseAgent):
|
|||||||
prompt = (
|
prompt = (
|
||||||
'Extract expense details from the following receipt text. '
|
'Extract expense details from the following receipt text. '
|
||||||
'Return ONLY valid JSON with these keys:\n'
|
'Return ONLY valid JSON with these keys:\n'
|
||||||
'"vendor" (string, merchant name),\n'
|
'"vendor" (string, merchant or restaurant name),\n'
|
||||||
'"amount" (number, the total amount charged — look for "Total", "Amount Due", "Grand Total"),\n'
|
'"amount" (number — the FINAL total the customer paid; '
|
||||||
f'"date" (string YYYY-MM-DD, use {date_hint or today} if not found),\n'
|
'this is labeled "Total", "Amount Due", "Grand Total", or the last dollar figure; '
|
||||||
|
'do NOT use subtotal, tax, or tip separately; '
|
||||||
|
'if multiple totals appear pick the largest one labeled as the final total),\n'
|
||||||
|
f'"date" (string YYYY-MM-DD, use {date_hint or today} if not found in text),\n'
|
||||||
f'"product_name" (string, pick the best match from [{product_list}] or empty string).\n\n'
|
f'"product_name" (string, pick the best match from [{product_list}] or empty string).\n\n'
|
||||||
f'Receipt text (first 2000 chars):\n{text[:2000]}\n\nJSON only:'
|
f'Receipt text:\n{text[:2000]}\n\nJSON only:'
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
resp = await self._llm.submit(
|
resp = await self._llm.submit(
|
||||||
|
|||||||
@@ -81,10 +81,33 @@ def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:
|
|||||||
|
|
||||||
def _ocr_image(data: bytes, filename: str) -> str:
|
def _ocr_image(data: bytes, filename: str) -> str:
|
||||||
try:
|
try:
|
||||||
from PIL import Image
|
from PIL import Image, ImageFilter, ImageOps
|
||||||
import pytesseract
|
import pytesseract
|
||||||
img = Image.open(io.BytesIO(data))
|
img = Image.open(io.BytesIO(data))
|
||||||
return pytesseract.image_to_string(img).strip()
|
|
||||||
|
# Resize very large images — tesseract is slower and less accurate at
|
||||||
|
# phone-camera resolution; 1800px wide is plenty for receipt text.
|
||||||
|
max_w = 1800
|
||||||
|
if img.width > max_w:
|
||||||
|
scale = max_w / img.width
|
||||||
|
img = img.resize((max_w, int(img.height * scale)), Image.LANCZOS)
|
||||||
|
|
||||||
|
# Convert to grayscale and sharpen — improves OCR on thermal receipts
|
||||||
|
img = ImageOps.grayscale(img)
|
||||||
|
img = img.filter(ImageFilter.SHARPEN)
|
||||||
|
|
||||||
|
# Let Tesseract detect orientation (OSD) and use LSTM engine.
|
||||||
|
# psm 1 = automatic + orientation detection so rotated/sideways receipts
|
||||||
|
# are handled correctly. Fall back to psm 6 if OSD fails.
|
||||||
|
config_osd = '--oem 3 --psm 1'
|
||||||
|
config_block = '--oem 3 --psm 6'
|
||||||
|
try:
|
||||||
|
text = pytesseract.image_to_string(img, config=config_osd).strip()
|
||||||
|
except Exception:
|
||||||
|
text = pytesseract.image_to_string(img, config=config_block).strip()
|
||||||
|
|
||||||
|
logger.debug('OCR %s: %d chars extracted', filename, len(text))
|
||||||
|
return text
|
||||||
except ImportError:
|
except ImportError:
|
||||||
logger.warning('pytesseract/Pillow not installed — OCR unavailable for %s', filename)
|
logger.warning('pytesseract/Pillow not installed — OCR unavailable for %s', filename)
|
||||||
return f'[Image: {filename} — install pytesseract+Pillow for OCR]'
|
return f'[Image: {filename} — install pytesseract+Pillow for OCR]'
|
||||||
|
|||||||
Reference in New Issue
Block a user