Remove vision OCR — use Tesseract-only pipeline for receipt parsing

The llama3.2-vision model was producing unreliable structured data
(wrong vendors, amounts, and dates), making expense reports worse than
those produced by Tesseract + LLM extraction.  This commit removes
_ocr_image_vision(), the vision JSON fast path in _parse_receipt_text(),
_match_category(), and the vision_ocr_model config setting entirely.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-20 22:32:26 -04:00
parent ec6b41943f
commit 0320591344
4 changed files with 4 additions and 247 deletions

View File

@@ -80,121 +80,10 @@ def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:
def _ocr_image(data: bytes, filename: str) -> str:
    """Extract text from a receipt image.

    Tries vision-model OCR first when VISION_OCR_MODEL is configured
    (``settings.vision_ocr_model`` is truthy), then falls back to the
    Tesseract pipeline.

    Args:
        data: Image bytes, forwarded unchanged to the OCR helpers.
        filename: Forwarded to the OCR helpers and used in the fallback
            log message.

    Returns:
        The vision helper's result when it is non-empty; otherwise the
        result of ``_ocr_image_tesseract``.
    """
    # Function-scope import kept as in the original block.
    from agent_service.config import get_settings

    settings = get_settings()
    if settings.vision_ocr_model:
        result = _ocr_image_vision(data, filename,
                                   settings.ollama_url,
                                   settings.vision_ocr_model)
        if result:
            return result
        # An empty string is the vision helper's "failed" signal.
        logger.warning('Vision OCR returned empty for %s — falling back to Tesseract', filename)
    # Tesseract path: used when vision OCR is not configured or returned nothing.
    return _ocr_image_tesseract(data, filename)
def _ocr_image_vision(data: bytes, filename: str, ollama_url: str, model: str) -> str:
    """Use an Ollama vision model to extract receipt data directly as JSON.

    Sends the image to ``model`` through the Ollama chat API in JSON mode,
    takes the outermost ``{...}`` span of the reply and parses it.  If the
    first parse fails, a few common formatting repairs are tried, least
    invasive first.

    Args:
        data: Image bytes, passed in the chat message's ``images`` list.
        filename: Used only in log messages.
        ollama_url: Host handed to ``ollama.Client``.
        model: Name of the vision model to query.

    Returns:
        A JSON string re-serialised with ``json.dumps``.  The prompt asks
        for ``{vendor, amount, date, time, category}``; only the presence
        of ``amount`` is checked here.  Returns an empty string on any
        failure (``ollama`` not installed, request error, reply without a
        JSON object, unparseable JSON, missing ``amount``) so the caller
        falls back to Tesseract.
    """
    import json as _json
    import re as _re

    def _repair_candidates(s: str) -> list[str]:
        """Return repaired variants of *s*, least invasive first.

        Handles the most common LLM JSON formatting mistakes:
          - trailing commas before } or ]   {"a":1,}   becomes {"a":1}
          - single-quoted strings           {'a':'b'}  becomes {"a":"b"}
          - unquoted string keys            {a: "b"}   becomes {"a": "b"}

        The single-quote rewrite also hits apostrophes inside
        double-quoted values ("Joe's"), so variants that skip it are
        offered first.  The last variant applies every repair and is
        what the previous one-shot repair produced, so nothing that
        used to parse is lost.
        """
        no_trailing = _re.sub(r',\s*([}\]])', r'\1', s)
        keys_quoted = _re.sub(r'(?<!["\w])(\w+)\s*:', r'"\1":', no_trailing)
        double_quoted = _re.sub(r"'([^']*)'", r'"\1"', no_trailing)
        everything = _re.sub(r'(?<!["\w])(\w+)\s*:', r'"\1":', double_quoted)
        return [no_trailing, keys_quoted, double_quoted, everything]

    try:
        import ollama as _ollama

        client = _ollama.Client(host=ollama_url)
        response = client.chat(
            model=model,
            format='json',   # Ollama JSON mode — forces syntactically valid output
            messages=[{
                'role': 'user',
                'content': (
                    'You are a receipt data extractor. '
                    'Read this receipt image and extract the following fields. '
                    'Copy values EXACTLY as printed — do NOT guess, infer, or '
                    'invent values you cannot clearly see.\n\n'
                    'Fields to extract:\n'
                    '- vendor: the store or restaurant name exactly as printed; '
                    'empty string if not clearly visible\n'
                    '- amount: the FINAL total the customer paid; find a line '
                    'labeled "Total", "Grand Total", "Amount Due", or "Balance Due"; '
                    'copy the number exactly; do NOT use subtotal, tax, or tip; '
                    'return 0 if no clearly labeled final total is visible\n'
                    '- date: transaction date in YYYY-MM-DD format; '
                    'null if not clearly visible\n'
                    '- time: transaction time in HH:MM 24-hour format; '
                    'null if not clearly visible\n'
                    '- category: one of: meals, fuel, hotel, office, transport, other\n\n'
                    'Return ONLY a valid JSON object, no commentary, no markdown:\n'
                    '{"vendor":"...","amount":0.00,"date":"YYYY-MM-DD or null",'
                    '"time":"HH:MM or null","category":"..."}'
                ),
                'images': [data],
            }],
        )

        # The reply is handled both as a plain dict and as an object
        # exposing ``.message.content``.
        if isinstance(response, dict):
            raw = (response.get('message', {}).get('content') or '').strip()
        else:
            raw = (response.message.content or '').strip()

        # Must contain a JSON object, not prose
        first, last = raw.find('{'), raw.rfind('}')
        if first == -1 or last <= first:
            logger.warning('Vision OCR %s: model returned prose, falling back to Tesseract',
                           filename)
            return ''
        json_str = raw[first:last + 1]

        # Parse — on failure try repaired variants until one parses
        try:
            parsed = _json.loads(json_str)
        except _json.JSONDecodeError as json_err:
            for candidate in _repair_candidates(json_str):
                try:
                    parsed = _json.loads(candidate)
                except _json.JSONDecodeError:
                    continue
                logger.debug('Vision OCR %s: JSON repaired successfully', filename)
                break
            else:
                # No variant parsed; report the original parse error.
                logger.warning('Vision OCR %s: JSON parse failed (%s), falling back',
                               filename, json_err)
                return ''

        if 'amount' not in parsed:
            logger.warning('Vision OCR %s: JSON missing amount field, falling back', filename)
            return ''

        logger.debug('Vision OCR %s (%s): extracted JSON ok', filename, model)
        # Re-serialise so downstream always gets clean, canonical JSON
        return _json.dumps(parsed)

    except ImportError:
        logger.warning('ollama package not installed — vision OCR unavailable for %s', filename)
        return ''
    except Exception as exc:
        # Deliberate best-effort: any failure here means "use Tesseract instead".
        logger.warning('Vision OCR failed for %s: %s', filename, exc)
        return ''
def _ocr_image_tesseract(data: bytes, filename: str) -> str:
"""Tesseract-based OCR pipeline (fallback)."""
try: