Add vision OCR via Ollama vision model with Tesseract fallback

Introduces the VISION_OCR_MODEL setting. When set (e.g. llama3.2-vision:11b), receipt images are transcribed by the Ollama vision model first; if the model call fails or returns no text, the Tesseract pipeline is used as a fallback. Also improves Tesseract preprocessing by binarising the image (autocontrast followed by a fixed threshold at 140) for better accuracy on thermal receipts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 18:43:21 -04:00
parent 9f38fb013c
commit 5b924e60de
2 changed files with 63 additions and 10 deletions
--- a/agent_service/config.py
+++ b/agent_service/config.py
@@ -16,6 +16,10 @@ class Settings(BaseSettings):
    ollama_model: str = 'activeblue-chat'
    ollama_timeout: int = 120
    ollama_max_concurrent: int = 2
+    # Set to a vision-capable model (e.g. llama3.2-vision:11b) to use
+    # vision OCR for receipt images instead of Tesseract.  Leave empty
+    # to keep the Tesseract pipeline.
+    vision_ocr_model: str = ''

    # Anthropic / Claude
    anthropic_api_key: str = ''
--- a/agent_service/tools/receipt_parser.py
+++ b/agent_service/tools/receipt_parser.py
@@ -80,6 +80,56 @@ def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:


 def _ocr_image(data: bytes, filename: str) -> str:
+    """Extract text from a receipt image.
+
+    Tries vision-model OCR first when VISION_OCR_MODEL is configured,
+    then falls back to the Tesseract pipeline.
+    """
+    from agent_service.config import get_settings
+    settings = get_settings()
+    if settings.vision_ocr_model:
+        result = _ocr_image_vision(data, filename,
+                                   settings.ollama_url,
+                                   settings.vision_ocr_model)
+        if result:
+            return result
+        logger.warning('Vision OCR returned empty for %s — falling back to Tesseract', filename)
+    return _ocr_image_tesseract(data, filename)
+
+
+def _ocr_image_vision(data: bytes, filename: str, ollama_url: str, model: str) -> str:
+    """Use an Ollama vision model to read a receipt image."""
+    try:
+        import ollama as _ollama
+        client = _ollama.Client(host=ollama_url)
+        response = client.chat(
+            model=model,
+            messages=[{
+                'role': 'user',
+                'content': (
+                    'This is a photo of a paper receipt. '
+                    'Transcribe ALL text exactly as it appears on the receipt. '
+                    'Preserve every line in order: store name, address, date, time, '
+                    'each line item with price, subtotal, tax, tip if present, and '
+                    'the final total. Output the raw text only — no commentary, '
+                    'no markdown, no explanations.'
+                ),
+                'images': [data],
+            }],
+        )
+        text = (response.message.content or '').strip()
+        logger.debug('Vision OCR %s (%s): %d chars', filename, model, len(text))
+        return text
+    except ImportError:
+        logger.warning('ollama package not installed — vision OCR unavailable for %s', filename)
+        return ''
+    except Exception as exc:
+        logger.warning('Vision OCR failed for %s: %s', filename, exc)
+        return ''
+
+
+def _ocr_image_tesseract(data: bytes, filename: str) -> str:
+    """Tesseract-based OCR pipeline (fallback)."""
    try:
        from PIL import Image, ImageFilter, ImageOps
        import pytesseract
@@ -92,27 +142,26 @@ def _ocr_image(data: bytes, filename: str) -> str:
            scale = max_w / img.width
            img = img.resize((max_w, int(img.height * scale)), Image.LANCZOS)

-        # Convert to grayscale and sharpen — improves OCR on thermal receipts
+        # Grayscale + adaptive binarisation + sharpen
        img = ImageOps.grayscale(img)
+        img = ImageOps.autocontrast(img)
+        img = img.point(lambda x: 0 if x < 140 else 255)
        img = img.filter(ImageFilter.SHARPEN)

-        # Let Tesseract detect orientation (OSD) and use LSTM engine.
-        # psm 1 = automatic + orientation detection so rotated/sideways receipts
-        # are handled correctly. Fall back to psm 6 if OSD fails.
-        config_osd = '--oem 3 --psm 1'
-        config_block = '--oem 3 --psm 6'
+        # psm 1 = automatic page segmentation + OSD (handles rotated receipts).
+        # Fall back to psm 6 if OSD data is missing.
        try:
-            text = pytesseract.image_to_string(img, config=config_osd).strip()
+            text = pytesseract.image_to_string(img, config='--oem 3 --psm 1').strip()
        except Exception:
-            text = pytesseract.image_to_string(img, config=config_block).strip()
+            text = pytesseract.image_to_string(img, config='--oem 3 --psm 6').strip()

-        logger.debug('OCR %s: %d chars extracted', filename, len(text))
+        logger.debug('Tesseract OCR %s: %d chars', filename, len(text))
        return text
    except ImportError:
        logger.warning('pytesseract/Pillow not installed — OCR unavailable for %s', filename)
        return f'[Image: {filename} — install pytesseract+Pillow for OCR]'
    except Exception as exc:
-        logger.warning('OCR failed for %s: %s', filename, exc)
+        logger.warning('Tesseract OCR failed for %s: %s', filename, exc)
        return f'[Image: {filename} — OCR failed: {exc}]'