From 5b924e60deb1cdd9706f5f691df0cae6b1be691a Mon Sep 17 00:00:00 2001
From: Carlos Garcia
Date: Sat, 16 May 2026 18:43:21 -0400
Subject: [PATCH] Add vision OCR via Ollama vision model with Tesseract fallback

Introduces VISION_OCR_MODEL setting. When set (e.g. llama3.2-vision:11b),
receipt images are transcribed by the Ollama vision model before
falling back to Tesseract. The vision request is bounded by the existing
OLLAMA_TIMEOUT setting so a stalled model cannot block receipt parsing.

Also improves Tesseract preprocessing with binarisation
(autocontrast + fixed threshold at 140) for better accuracy on thermal
receipts.

Co-Authored-By: Claude Sonnet 4.6
---
 agent_service/config.py               |  4 ++
 agent_service/tools/receipt_parser.py | 78 +++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 72 insertions(+), 10 deletions(-)

diff --git a/agent_service/config.py b/agent_service/config.py
index a06e463..6428af1 100644
--- a/agent_service/config.py
+++ b/agent_service/config.py
@@ -16,6 +16,10 @@ class Settings(BaseSettings):
     ollama_model: str = 'activeblue-chat'
     ollama_timeout: int = 120
     ollama_max_concurrent: int = 2
+    # Set to a vision-capable model (e.g. llama3.2-vision:11b) to use
+    # vision OCR for receipt images instead of Tesseract. Leave empty
+    # to keep the Tesseract pipeline.
+    vision_ocr_model: str = ''
 
     # Anthropic / Claude
     anthropic_api_key: str = ''
diff --git a/agent_service/tools/receipt_parser.py b/agent_service/tools/receipt_parser.py
index d1e5cad..7f42966 100644
--- a/agent_service/tools/receipt_parser.py
+++ b/agent_service/tools/receipt_parser.py
@@ -80,6 +80,65 @@ def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:
 
 
 def _ocr_image(data: bytes, filename: str) -> str:
+    """Extract text from a receipt image.
+
+    Tries vision-model OCR first when VISION_OCR_MODEL is configured,
+    then falls back to the Tesseract pipeline.
+    """
+    from agent_service.config import get_settings
+    settings = get_settings()
+    if settings.vision_ocr_model:
+        result = _ocr_image_vision(data, filename,
+                                   settings.ollama_url,
+                                   settings.vision_ocr_model,
+                                   timeout=settings.ollama_timeout)
+        if result:
+            return result
+        logger.warning('Vision OCR returned empty for %s — falling back to Tesseract', filename)
+    return _ocr_image_tesseract(data, filename)
+
+
+def _ocr_image_vision(data: bytes, filename: str, ollama_url: str, model: str,
+                      timeout: float = 120) -> str:
+    """Use an Ollama vision model to read a receipt image.
+
+    Returns the transcribed text, or '' when the ollama package is not
+    installed or the request fails, so the caller can fall back to
+    Tesseract. ``timeout`` is in seconds; the ollama client applies no
+    request timeout unless one is passed.
+    """
+    try:
+        import ollama as _ollama
+        # Extra keyword arguments are forwarded to the underlying httpx client.
+        client = _ollama.Client(host=ollama_url, timeout=timeout)
+        response = client.chat(
+            model=model,
+            messages=[{
+                'role': 'user',
+                'content': (
+                    'This is a photo of a paper receipt. '
+                    'Transcribe ALL text exactly as it appears on the receipt. '
+                    'Preserve every line in order: store name, address, date, time, '
+                    'each line item with price, subtotal, tax, tip if present, and '
+                    'the final total. Output the raw text only — no commentary, '
+                    'no markdown, no explanations.'
+                ),
+                'images': [data],
+            }],
+        )
+        text = (response.message.content or '').strip()
+        logger.debug('Vision OCR %s (%s): %d chars', filename, model, len(text))
+        return text
+    except ImportError:
+        logger.warning('ollama package not installed — vision OCR unavailable for %s', filename)
+        return ''
+    except Exception as exc:
+        logger.warning('Vision OCR failed for %s: %s', filename, exc)
+        return ''
+
+
+def _ocr_image_tesseract(data: bytes, filename: str) -> str:
+    """Tesseract-based OCR pipeline (fallback)."""
     try:
         from PIL import Image, ImageFilter, ImageOps
         import pytesseract
@@ -92,27 +151,26 @@ def _ocr_image(data: bytes, filename: str) -> str:
             scale = max_w / img.width
             img = img.resize((max_w, int(img.height * scale)), Image.LANCZOS)
 
-        # Convert to grayscale and sharpen — improves OCR on thermal receipts
+        # Grayscale, autocontrast, fixed-threshold (140) binarisation, then sharpen
         img = ImageOps.grayscale(img)
+        img = ImageOps.autocontrast(img)
+        img = img.point(lambda x: 0 if x < 140 else 255)
         img = img.filter(ImageFilter.SHARPEN)
 
-        # Let Tesseract detect orientation (OSD) and use LSTM engine.
-        # psm 1 = automatic + orientation detection so rotated/sideways receipts
-        # are handled correctly. Fall back to psm 6 if OSD fails.
-        config_osd = '--oem 3 --psm 1'
-        config_block = '--oem 3 --psm 6'
+        # psm 1 = automatic page segmentation + OSD (handles rotated receipts).
+        # Fall back to psm 6 if OSD data is missing.
         try:
-            text = pytesseract.image_to_string(img, config=config_osd).strip()
+            text = pytesseract.image_to_string(img, config='--oem 3 --psm 1').strip()
         except Exception:
-            text = pytesseract.image_to_string(img, config=config_block).strip()
+            text = pytesseract.image_to_string(img, config='--oem 3 --psm 6').strip()
 
-        logger.debug('OCR %s: %d chars extracted', filename, len(text))
+        logger.debug('Tesseract OCR %s: %d chars', filename, len(text))
         return text
     except ImportError:
         logger.warning('pytesseract/Pillow not installed — OCR unavailable for %s', filename)
         return f'[Image: {filename} — install pytesseract+Pillow for OCR]'
     except Exception as exc:
-        logger.warning('OCR failed for %s: %s', filename, exc)
+        logger.warning('Tesseract OCR failed for %s: %s', filename, exc)
         return f'[Image: {filename} — OCR failed: {exc}]'
 
 