Add vision OCR via Ollama vision model with Tesseract fallback

Introduces the VISION_OCR_MODEL setting. When set (e.g. llama3.2-vision:11b),
receipt images are first transcribed by the Ollama vision model; if that
call fails or returns no text, OCR falls back to Tesseract. Also improves
Tesseract preprocessing with binarisation (autocontrast followed by a
fixed threshold at 140) for better accuracy on thermal receipts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-16 18:43:21 -04:00
parent 9f38fb013c
commit 5b924e60de
2 changed files with 63 additions and 10 deletions

View File

@@ -16,6 +16,10 @@ class Settings(BaseSettings):
ollama_model: str = 'activeblue-chat'
ollama_timeout: int = 120
ollama_max_concurrent: int = 2
# Set to a vision-capable model (e.g. llama3.2-vision:11b) to try
# vision OCR first for receipt images; Tesseract remains the fallback.
# Leave empty to use only the Tesseract pipeline.
vision_ocr_model: str = ''
# Anthropic / Claude
anthropic_api_key: str = ''

View File

@@ -80,6 +80,56 @@ def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:
def _ocr_image(data: bytes, filename: str) -> str:
    """Return the text read from a receipt image.

    When the ``vision_ocr_model`` setting is non-empty, the image is first
    handed to the vision-model OCR helper and a non-empty transcript is
    returned as-is. In every other case (setting empty, or the vision helper
    produced an empty string) the Tesseract helper's result is returned.
    """
    from agent_service.config import get_settings

    cfg = get_settings()
    if not cfg.vision_ocr_model:
        # Vision OCR is not configured: go straight to Tesseract.
        return _ocr_image_tesseract(data, filename)

    transcript = _ocr_image_vision(
        data, filename, cfg.ollama_url, cfg.vision_ocr_model
    )
    if not transcript:
        # The vision helper signals any failure with an empty string.
        logger.warning('Vision OCR returned empty for %s — falling back to Tesseract', filename)
        return _ocr_image_tesseract(data, filename)
    return transcript
def _ocr_image_vision(data: bytes, filename: str, ollama_url: str, model: str) -> str:
    """Transcribe a receipt image with an Ollama vision model.

    Sends ``data`` as the only image of a single user message in a chat
    request to ``model`` via an Ollama client created for ``ollama_url``, and
    returns the reply text with surrounding whitespace stripped.

    Returns an empty string when the ``ollama`` package cannot be imported or
    when any other exception is raised along the way; both cases are logged
    as warnings rather than propagated.
    """
    prompt = (
        'This is a photo of a paper receipt. '
        'Transcribe ALL text exactly as it appears on the receipt. '
        'Preserve every line in order: store name, address, date, time, '
        'each line item with price, subtotal, tax, tip if present, and '
        'the final total. Output the raw text only — no commentary, '
        'no markdown, no explanations.'
    )
    request_messages = [{'role': 'user', 'content': prompt, 'images': [data]}]

    try:
        # Imported lazily so a missing package only disables vision OCR.
        import ollama as _ollama

        reply = _ollama.Client(host=ollama_url).chat(
            model=model,
            messages=request_messages,
        )
        raw = reply.message.content
        transcript = raw.strip() if raw else ''
        logger.debug('Vision OCR %s (%s): %d chars', filename, model, len(transcript))
        return transcript
    except ImportError:
        logger.warning('ollama package not installed — vision OCR unavailable for %s', filename)
        return ''
    except Exception as exc:
        # Best-effort: any client/transport/response error yields '' so the
        # caller can decide what to do next.
        logger.warning('Vision OCR failed for %s: %s', filename, exc)
        return ''
def _ocr_image_tesseract(data: bytes, filename: str) -> str:
"""Tesseract-based OCR pipeline (fallback)."""
try:
from PIL import Image, ImageFilter, ImageOps
import pytesseract
@@ -92,27 +142,26 @@ def _ocr_image(data: bytes, filename: str) -> str:
scale = max_w / img.width
img = img.resize((max_w, int(img.height * scale)), Image.LANCZOS)
# Convert to grayscale and sharpen — improves OCR on thermal receipts
# Grayscale + adaptive binarisation + sharpen
img = ImageOps.grayscale(img)
img = ImageOps.autocontrast(img)
img = img.point(lambda x: 0 if x < 140 else 255)
img = img.filter(ImageFilter.SHARPEN)
# Let Tesseract detect orientation (OSD) and use LSTM engine.
# psm 1 = automatic + orientation detection so rotated/sideways receipts
# are handled correctly. Fall back to psm 6 if OSD fails.
config_osd = '--oem 3 --psm 1'
config_block = '--oem 3 --psm 6'
# psm 1 = automatic page segmentation + OSD (handles rotated receipts).
# Fall back to psm 6 if OSD data is missing.
try:
text = pytesseract.image_to_string(img, config=config_osd).strip()
text = pytesseract.image_to_string(img, config='--oem 3 --psm 1').strip()
except Exception:
text = pytesseract.image_to_string(img, config=config_block).strip()
text = pytesseract.image_to_string(img, config='--oem 3 --psm 6').strip()
logger.debug('OCR %s: %d chars extracted', filename, len(text))
logger.debug('Tesseract OCR %s: %d chars', filename, len(text))
return text
except ImportError:
logger.warning('pytesseract/Pillow not installed — OCR unavailable for %s', filename)
return f'[Image: {filename} — install pytesseract+Pillow for OCR]'
except Exception as exc:
logger.warning('OCR failed for %s: %s', filename, exc)
logger.warning('Tesseract OCR failed for %s: %s', filename, exc)
return f'[Image: {filename} — OCR failed: {exc}]'