Remove vision OCR — use Tesseract-only pipeline for receipt parsing
The llama3.2-vision model was producing unreliable structured data (wrong vendors, amounts, dates), making expense reports worse than the Tesseract + LLM extraction pipeline. This commit removes _ocr_image_vision(), the vision JSON fast path in _parse_receipt_text(), _match_category(), and the vision_ocr_model config setting entirely. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -80,121 +80,10 @@ def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:
|
||||
|
||||
|
||||
def _ocr_image(data: bytes, filename: str) -> str:
    """Extract text from a receipt image.

    When ``settings.vision_ocr_model`` is set, the vision-model path
    (``_ocr_image_vision``) is tried first.  If it returns an empty result,
    or no vision model is configured, the Tesseract pipeline
    (``_ocr_image_tesseract``) is used instead.

    Args:
        data: Raw bytes of the receipt image.
        filename: Name of the image; forwarded to the OCR helpers and used
            in the fallback log message.

    Returns:
        The non-empty result of ``_ocr_image_vision`` when the vision path
        succeeds, otherwise whatever ``_ocr_image_tesseract`` returns.
    """
    # Function-scope import, kept exactly where the original had it.
    from agent_service.config import get_settings

    settings = get_settings()
    vision_model = settings.vision_ocr_model
    if vision_model:
        vision_result = _ocr_image_vision(
            data,
            filename,
            settings.ollama_url,
            vision_model,
        )
        if vision_result:
            return vision_result
        # An empty string is the vision helper's "I failed" signal.
        logger.warning('Vision OCR returned empty for %s — falling back to Tesseract', filename)

    return _ocr_image_tesseract(data, filename)
|
||||
|
||||
|
||||
def _ocr_image_vision(data: bytes, filename: str, ollama_url: str, model: str) -> str:
    """Use an Ollama vision model to extract receipt data directly as JSON.

    The model is prompted for ``vendor``, ``amount``, ``date``, ``time`` and
    ``category``.  Its reply is parsed (with one repair attempt for common
    formatting mistakes), validated, and re-serialised with ``json.dumps``.

    Args:
        data: Raw image bytes, attached to the chat message as its only image.
        filename: Used only in log messages.
        ollama_url: Host passed to ``ollama.Client``.
        model: Name of the vision model to query.

    Returns:
        A JSON object string on success.  An empty string on any failure so
        the caller can fall back to Tesseract: ``ollama`` not installed, the
        request raising, no JSON object in the reply, unparseable JSON, a
        missing ``amount`` key, or an ``amount`` that is null, empty or zero
        (zero is what the prompt tells the model to return when no total is
        visible).
    """
    import json as _json
    import re as _re

    prompt = (
        'You are a receipt data extractor. '
        'Read this receipt image and extract the following fields. '
        'Copy values EXACTLY as printed — do NOT guess, infer, or '
        'invent values you cannot clearly see.\n\n'
        'Fields to extract:\n'
        '- vendor: the store or restaurant name exactly as printed; '
        'empty string if not clearly visible\n'
        '- amount: the FINAL total the customer paid; find a line '
        'labeled "Total", "Grand Total", "Amount Due", or "Balance Due"; '
        'copy the number exactly; do NOT use subtotal, tax, or tip; '
        'return 0 if no clearly labeled final total is visible\n'
        '- date: transaction date in YYYY-MM-DD format; '
        'null if not clearly visible\n'
        '- time: transaction time in HH:MM 24-hour format; '
        'null if not clearly visible\n'
        '- category: one of: meals, fuel, hotel, office, transport, other\n\n'
        'Return ONLY a valid JSON object, no commentary, no markdown:\n'
        '{"vendor":"...","amount":0.00,"date":"YYYY-MM-DD or null",'
        '"time":"HH:MM or null","category":"..."}'
    )

    def _repair_json(s: str) -> str:
        """Fix the most common LLM JSON formatting mistakes.

        Handles:
        - trailing commas before } or ]   → {"a":1,} becomes {"a":1}
        - single-quoted strings           → {'a':'b'} becomes {"a":"b"}
        - unquoted string keys            → {a: "b"} becomes {"a": "b"}

        These are blind regex rewrites, so they can also touch text inside
        valid string values; the result is only trusted if it then parses.
        """
        # trailing commas
        s = _re.sub(r',\s*([}\]])', r'\1', s)
        # single-quoted strings (careful around apostrophes in values)
        s = _re.sub(r"'([^']*)'", r'"\1"', s)
        # unquoted keys: word characters before a colon
        s = _re.sub(r'(?<!["\w])(\w+)\s*:', r'"\1":', s)
        return s

    def _amount_is_missing(value) -> bool:
        """Return True when *value* is the "no total found" answer (null, empty, or zero)."""
        if value is None:
            return True
        if isinstance(value, str):
            text = value.strip()
            if not text:
                return True
            try:
                return float(text) == 0
            except ValueError:
                # Non-numeric text is passed downstream unchanged, as before.
                return False
        if isinstance(value, (int, float)):
            return value == 0
        return False

    try:
        import ollama as _ollama

        client = _ollama.Client(host=ollama_url)
        response = client.chat(
            model=model,
            format='json',  # Ollama JSON mode — forces syntactically valid output
            messages=[{
                'role': 'user',
                'content': prompt,
                'images': [data],
            }],
        )

        # Accept both a dict-shaped response and one exposing attributes.
        if isinstance(response, dict):
            raw = (response.get('message', {}).get('content') or '').strip()
        else:
            raw = (response.message.content or '').strip()

        # Must contain a JSON object, not prose
        first, last = raw.find('{'), raw.rfind('}')
        if first == -1 or last <= first:
            logger.warning('Vision OCR %s: model returned prose, falling back to Tesseract',
                           filename)
            return ''
        json_str = raw[first:last + 1]

        # Parse — on failure attempt common repairs then retry once
        try:
            parsed = _json.loads(json_str)
        except _json.JSONDecodeError as json_err:
            try:
                parsed = _json.loads(_repair_json(json_str))
            except _json.JSONDecodeError:
                # Report the original error, not the one from the repaired text.
                logger.warning('Vision OCR %s: JSON parse failed (%s), falling back',
                               filename, json_err)
                return ''
            logger.debug('Vision OCR %s: JSON repaired successfully', filename)

        if 'amount' not in parsed:
            logger.warning('Vision OCR %s: JSON missing amount field, falling back', filename)
            return ''

        # The prompt tells the model to answer 0 when no final total is
        # visible, so null/empty/zero means "not found", not a real total.
        if _amount_is_missing(parsed['amount']):
            logger.warning('Vision OCR %s: no usable amount (%r), falling back',
                           filename, parsed['amount'])
            return ''

        logger.debug('Vision OCR %s (%s): extracted JSON ok', filename, model)
        # Re-serialise so downstream always gets clean, canonical JSON
        return _json.dumps(parsed)
    except ImportError:
        logger.warning('ollama package not installed — vision OCR unavailable for %s', filename)
        return ''
    except Exception as exc:
        # Deliberate best-effort boundary: any failure means "use Tesseract".
        logger.warning('Vision OCR failed for %s: %s', filename, exc)
        return ''
|
||||
|
||||
|
||||
def _ocr_image_tesseract(data: bytes, filename: str) -> str:
|
||||
"""Tesseract-based OCR pipeline (fallback)."""
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user