Improve receipt OCR quality and total-amount extraction.

* Dockerfile: additionally install the tesseract-ocr-osd package
  (presumably the orientation/script-detection data used by --psm 1).
* agent_service/agents/expenses_agent.py: tighten the LLM prompt so "amount"
  is the final total paid rather than a subtotal, tax, or tip.
* agent_service/tools/receipt_parser.py: downscale images wider than 1800px,
  convert to grayscale and sharpen, then run Tesseract with --psm 1 and fall
  back to --psm 6 if that pass raises.

Review amendments (receipt_parser.py hunk only): the resized height is
clamped to at least 1px, and a failed --psm 1 pass is logged before the
--psm 6 retry instead of being swallowed silently. The post-image blob id on
that file's index line predates these amendments.

diff --git a/Dockerfile b/Dockerfile
index 65f191f..a1e8000 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,6 +8,7 @@ WORKDIR /app
 RUN apt-get update && apt-get install -y --no-install-recommends \
     gcc libpq-dev \
     tesseract-ocr \
+    tesseract-ocr-osd \
     && rm -rf /var/lib/apt/lists/*
 
 COPY requirements.txt .
diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py
index 07f6931..1970d22 100644
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -220,11 +220,14 @@ class ExpensesAgent(BaseAgent):
         prompt = (
             'Extract expense details from the following receipt text. '
             'Return ONLY valid JSON with these keys:\n'
-            '"vendor" (string, merchant name),\n'
-            '"amount" (number, the total amount charged — look for "Total", "Amount Due", "Grand Total"),\n'
-            f'"date" (string YYYY-MM-DD, use {date_hint or today} if not found),\n'
+            '"vendor" (string, merchant or restaurant name),\n'
+            '"amount" (number — the FINAL total the customer paid; '
+            'this is labeled "Total", "Amount Due", "Grand Total", or the last dollar figure; '
+            'do NOT use subtotal, tax, or tip separately; '
+            'if multiple totals appear pick the largest one labeled as the final total),\n'
+            f'"date" (string YYYY-MM-DD, use {date_hint or today} if not found in text),\n'
             f'"product_name" (string, pick the best match from [{product_list}] or empty string).\n\n'
-            f'Receipt text (first 2000 chars):\n{text[:2000]}\n\nJSON only:'
+            f'Receipt text:\n{text[:2000]}\n\nJSON only:'
         )
         try:
             resp = await self._llm.submit(
diff --git a/agent_service/tools/receipt_parser.py b/agent_service/tools/receipt_parser.py
index 5eaff6b..d1e5cad 100644
--- a/agent_service/tools/receipt_parser.py
+++ b/agent_service/tools/receipt_parser.py
@@ -81,10 +81,37 @@ def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:
 
 def _ocr_image(data: bytes, filename: str) -> str:
     try:
-        from PIL import Image
+        from PIL import Image, ImageFilter, ImageOps
         import pytesseract
         img = Image.open(io.BytesIO(data))
-        return pytesseract.image_to_string(img).strip()
+
+        # Resize very large images — tesseract is slower and less accurate at
+        # phone-camera resolution; 1800px wide is plenty for receipt text.
+        max_w = 1800
+        if img.width > max_w:
+            scale = max_w / img.width
+            # Clamp to 1px: a very wide, very short image would otherwise
+            # truncate to height 0, which resize() rejects.
+            new_h = max(1, int(img.height * scale))
+            img = img.resize((max_w, new_h), Image.LANCZOS)
+
+        # Convert to grayscale and sharpen — improves OCR on thermal receipts
+        img = ImageOps.grayscale(img)
+        img = img.filter(ImageFilter.SHARPEN)
+
+        # Let Tesseract detect orientation (OSD) and use LSTM engine.
+        # psm 1 = automatic + orientation detection so rotated/sideways receipts
+        # are handled correctly. Fall back to psm 6 if OSD fails.
+        config_osd = '--oem 3 --psm 1'
+        config_block = '--oem 3 --psm 6'
+        try:
+            text = pytesseract.image_to_string(img, config=config_osd).strip()
+        except Exception as exc:
+            # Best-effort fallback is kept, but the reason is recorded.
+            logger.warning('OCR psm 1 pass failed for %s (%s); retrying with psm 6', filename, exc)
+            text = pytesseract.image_to_string(img, config=config_block).strip()
+
+        logger.debug('OCR %s: %d chars extracted', filename, len(text))
+        return text
     except ImportError:
         logger.warning('pytesseract/Pillow not installed — OCR unavailable for %s', filename)
         return f'[Image: {filename} — install pytesseract+Pillow for OCR]'