Add vision OCR via Ollama vision model with Tesseract fallback
Introduces the VISION_OCR_MODEL setting. When it is set (e.g. llama3.2-vision:11b), receipt images are transcribed by the Ollama vision model first; Tesseract is used as a fallback only when vision OCR fails or returns no text. Also improves Tesseract preprocessing with adaptive binarisation (autocontrast + threshold at 140) for better accuracy on thermal receipts. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,10 @@ class Settings(BaseSettings):
|
||||
ollama_model: str = 'activeblue-chat'
|
||||
ollama_timeout: int = 120
|
||||
ollama_max_concurrent: int = 2
|
||||
# Set to a vision-capable model (e.g. llama3.2-vision:11b) to use
|
||||
# vision OCR for receipt images instead of Tesseract. Leave empty
|
||||
# to keep the Tesseract pipeline.
|
||||
vision_ocr_model: str = ''
|
||||
|
||||
# Anthropic / Claude
|
||||
anthropic_api_key: str = ''
|
||||
|
||||
@@ -80,6 +80,56 @@ def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:
|
||||
|
||||
|
||||
def _ocr_image(data: bytes, filename: str) -> str:
    """Return the text read from a receipt image.

    When the ``vision_ocr_model`` setting is non-empty, the image is first
    handed to the vision-model OCR helper. If that helper yields no text, a
    warning is logged and the Tesseract pipeline is used instead. With no
    vision model configured, the Tesseract pipeline is used directly.

    Args:
        data: Raw bytes of the image.
        filename: Name of the image, passed through to the helpers and
            used in the fallback log message.

    Returns:
        Text from the vision model when it produced any, otherwise
        whatever the Tesseract pipeline returns.
    """
    # Imported lazily, exactly as the settings are only needed per call.
    from agent_service.config import get_settings

    cfg = get_settings()
    vision_model = cfg.vision_ocr_model

    # No vision model configured: go straight to Tesseract.
    if not vision_model:
        return _ocr_image_tesseract(data, filename)

    transcript = _ocr_image_vision(data, filename, cfg.ollama_url, vision_model)
    if not transcript:
        logger.warning('Vision OCR returned empty for %s — falling back to Tesseract', filename)
        return _ocr_image_tesseract(data, filename)
    return transcript
|
||||
|
||||
|
||||
def _ocr_image_vision(
    data: bytes,
    filename: str,
    ollama_url: str,
    model: str,
    timeout: float = 120.0,
) -> str:
    """Use an Ollama vision model to read a receipt image.

    Args:
        data: Raw bytes of the image to transcribe.
        filename: Name of the image; used in log messages only.
        ollama_url: Host URL handed to the Ollama client.
        model: Name of the vision-capable model to query.
        timeout: Seconds to wait on the Ollama HTTP request. Without an
            explicit timeout the request can wait indefinitely on a
            stalled model, which would prevent any fallback from running.

    Returns:
        The stripped transcription, or ``''`` when there is nothing to
        transcribe, the ``ollama`` package is missing, or the request
        fails for any reason. This function never raises; an empty string
        is the signal for the caller to try another OCR method.
    """
    # Nothing to transcribe: skip the network round-trip entirely.
    if not data:
        return ''

    # Keep the import in its own try so that only a genuinely missing
    # package is reported as "not installed".
    try:
        import ollama as _ollama
    except ImportError:
        logger.warning('ollama package not installed — vision OCR unavailable for %s', filename)
        return ''

    prompt = (
        'This is a photo of a paper receipt. '
        'Transcribe ALL text exactly as it appears on the receipt. '
        'Preserve every line in order: store name, address, date, time, '
        'each line item with price, subtotal, tax, tip if present, and '
        'the final total. Output the raw text only — no commentary, '
        'no markdown, no explanations.'
    )

    try:
        # Extra keyword arguments are forwarded to the underlying HTTP
        # client, so ``timeout`` bounds the whole chat request.
        client = _ollama.Client(host=ollama_url, timeout=timeout)
        response = client.chat(
            model=model,
            messages=[{
                'role': 'user',
                'content': prompt,
                'images': [data],
            }],
        )
        text = (response.message.content or '').strip()
    except Exception as exc:
        # Best-effort by design: any failure (connection, timeout, bad
        # response shape) degrades to "no text" rather than an error.
        logger.warning('Vision OCR failed for %s: %s', filename, exc)
        return ''

    logger.debug('Vision OCR %s (%s): %d chars', filename, model, len(text))
    return text
|
||||
|
||||
|
||||
def _ocr_image_tesseract(data: bytes, filename: str) -> str:
|
||||
"""Tesseract-based OCR pipeline (fallback)."""
|
||||
try:
|
||||
from PIL import Image, ImageFilter, ImageOps
|
||||
import pytesseract
|
||||
@@ -92,27 +142,26 @@ def _ocr_image(data: bytes, filename: str) -> str:
|
||||
scale = max_w / img.width
|
||||
img = img.resize((max_w, int(img.height * scale)), Image.LANCZOS)
|
||||
|
||||
# Convert to grayscale and sharpen — improves OCR on thermal receipts
|
||||
# Grayscale + adaptive binarisation + sharpen
|
||||
img = ImageOps.grayscale(img)
|
||||
img = ImageOps.autocontrast(img)
|
||||
img = img.point(lambda x: 0 if x < 140 else 255)
|
||||
img = img.filter(ImageFilter.SHARPEN)
|
||||
|
||||
# Let Tesseract detect orientation (OSD) and use LSTM engine.
|
||||
# psm 1 = automatic + orientation detection so rotated/sideways receipts
|
||||
# are handled correctly. Fall back to psm 6 if OSD fails.
|
||||
config_osd = '--oem 3 --psm 1'
|
||||
config_block = '--oem 3 --psm 6'
|
||||
# psm 1 = automatic page segmentation + OSD (handles rotated receipts).
|
||||
# Fall back to psm 6 if OSD data is missing.
|
||||
try:
|
||||
text = pytesseract.image_to_string(img, config=config_osd).strip()
|
||||
text = pytesseract.image_to_string(img, config='--oem 3 --psm 1').strip()
|
||||
except Exception:
|
||||
text = pytesseract.image_to_string(img, config=config_block).strip()
|
||||
text = pytesseract.image_to_string(img, config='--oem 3 --psm 6').strip()
|
||||
|
||||
logger.debug('OCR %s: %d chars extracted', filename, len(text))
|
||||
logger.debug('Tesseract OCR %s: %d chars', filename, len(text))
|
||||
return text
|
||||
except ImportError:
|
||||
logger.warning('pytesseract/Pillow not installed — OCR unavailable for %s', filename)
|
||||
return f'[Image: {filename} — install pytesseract+Pillow for OCR]'
|
||||
except Exception as exc:
|
||||
logger.warning('OCR failed for %s: %s', filename, exc)
|
||||
logger.warning('Tesseract OCR failed for %s: %s', filename, exc)
|
||||
return f'[Image: {filename} — OCR failed: {exc}]'
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user