From 5b924e60deb1cdd9706f5f691df0cae6b1be691a Mon Sep 17 00:00:00 2001
From: Carlos Garcia <tocmo@DESKTOP-O6U0UOS.AVC.local>
Date: Sat, 16 May 2026 18:43:21 -0400
Subject: [PATCH] Add vision OCR via Ollama vision model with Tesseract
 fallback

Introduces the VISION_OCR_MODEL setting. When set (e.g. llama3.2-vision:11b),
receipt images are transcribed by the Ollama vision model first; Tesseract
is used only if that returns no text or fails. Also improves Tesseract
preprocessing with contrast-normalised binarisation (autocontrast, then a
fixed global threshold at 140) for better accuracy on thermal receipts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 agent_service/config.py               |  4 ++
 agent_service/tools/receipt_parser.py | 69 +++++++++++++++++++++++----
 2 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/agent_service/config.py b/agent_service/config.py
index a06e463..6428af1 100644
--- a/agent_service/config.py
+++ b/agent_service/config.py
@@ -16,6 +16,10 @@ class Settings(BaseSettings):
     ollama_model: str = 'activeblue-chat'
     ollama_timeout: int = 120
     ollama_max_concurrent: int = 2
+    # Set to a vision-capable model (e.g. llama3.2-vision:11b) to try
+    # vision OCR first for receipt images, with Tesseract as the fallback.
+    # Leave empty to use the Tesseract pipeline only.
+    vision_ocr_model: str = ''
 
     # Anthropic / Claude
     anthropic_api_key: str = ''
diff --git a/agent_service/tools/receipt_parser.py b/agent_service/tools/receipt_parser.py
index d1e5cad..7f42966 100644
--- a/agent_service/tools/receipt_parser.py
+++ b/agent_service/tools/receipt_parser.py
@@ -80,6 +80,56 @@ def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:
 
 
 def _ocr_image(data: bytes, filename: str) -> str:
+    """Return the text of a receipt image, preferring vision-model OCR.
+
+    With a non-empty ``vision_ocr_model`` setting the Ollama vision model is
+    tried first; an empty result falls through to the Tesseract pipeline.
+    """
+    from agent_service.config import get_settings
+    cfg = get_settings()
+    vision_model = cfg.vision_ocr_model
+    if not vision_model:
+        return _ocr_image_tesseract(data, filename)
+    text = _ocr_image_vision(data, filename, cfg.ollama_url, vision_model)
+    if not text:
+        logger.warning('Vision OCR returned empty for %s — falling back to Tesseract', filename)
+        return _ocr_image_tesseract(data, filename)
+    return text
+
+
+def _ocr_image_vision(data: bytes, filename: str, ollama_url: str, model: str) -> str:
+    """Transcribe a receipt image with an Ollama vision model.
+
+    The request is bounded by the ``ollama_timeout`` setting so a stalled model
+    cannot hang the caller.  Returns the stripped reply, or '' on any failure.
+    """
+    prompt = (
+        'This is a photo of a paper receipt. '
+        'Transcribe ALL text exactly as it appears on the receipt. '
+        'Preserve every line in order: store name, address, date, time, '
+        'each line item with price, subtotal, tax, tip if present, and '
+        'the final total. Output the raw text only — no commentary, '
+        'no markdown, no explanations.'
+    )
+    try:
+        import ollama as _ollama
+        from agent_service.config import get_settings
+        client = _ollama.Client(host=ollama_url, timeout=get_settings().ollama_timeout)
+        message = {'role': 'user', 'content': prompt, 'images': [data]}
+        response = client.chat(model=model, messages=[message])
+        text = (response.message.content or '').strip()
+        logger.debug('Vision OCR %s (%s): %d chars', filename, model, len(text))
+        return text
+    except ImportError:
+        logger.warning('ollama package not installed — vision OCR unavailable for %s', filename)
+        return ''
+    except Exception as exc:
+        logger.warning('Vision OCR failed for %s: %s', filename, exc)
+        return ''
+
+
+def _ocr_image_tesseract(data: bytes, filename: str) -> str:
+    """Tesseract-based OCR pipeline (fallback)."""
     try:
         from PIL import Image, ImageFilter, ImageOps
         import pytesseract
@@ -92,27 +142,26 @@ def _ocr_image(data: bytes, filename: str) -> str:
             scale = max_w / img.width
             img = img.resize((max_w, int(img.height * scale)), Image.LANCZOS)
 
-        # Convert to grayscale and sharpen — improves OCR on thermal receipts
+        # Grayscale + autocontrast + fixed-threshold binarisation (140) + sharpen
         img = ImageOps.grayscale(img)
+        img = ImageOps.autocontrast(img)
+        img = img.point(lambda x: 0 if x < 140 else 255)
         img = img.filter(ImageFilter.SHARPEN)
 
-        # Let Tesseract detect orientation (OSD) and use LSTM engine.
-        # psm 1 = automatic + orientation detection so rotated/sideways receipts
-        # are handled correctly. Fall back to psm 6 if OSD fails.
-        config_osd = '--oem 3 --psm 1'
-        config_block = '--oem 3 --psm 6'
+        # psm 1 = automatic page segmentation + OSD (handles rotated receipts).
+        # Fall back to psm 6 if OSD data is missing.
         try:
-            text = pytesseract.image_to_string(img, config=config_osd).strip()
+            text = pytesseract.image_to_string(img, config='--oem 3 --psm 1').strip()
         except Exception:
-            text = pytesseract.image_to_string(img, config=config_block).strip()
+            text = pytesseract.image_to_string(img, config='--oem 3 --psm 6').strip()
 
-        logger.debug('OCR %s: %d chars extracted', filename, len(text))
+        logger.debug('Tesseract OCR %s: %d chars', filename, len(text))
         return text
     except ImportError:
         logger.warning('pytesseract/Pillow not installed — OCR unavailable for %s', filename)
         return f'[Image: {filename} — install pytesseract+Pillow for OCR]'
     except Exception as exc:
-        logger.warning('OCR failed for %s: %s', filename, exc)
+        logger.warning('Tesseract OCR failed for %s: %s', filename, exc)
         return f'[Image: {filename} — OCR failed: {exc}]'