Remove vision OCR — use Tesseract-only pipeline for receipt parsing
The llama3.2-vision model was producing unreliable structured data (wrong vendors, amounts, dates), making expense reports worse than those produced by the Tesseract + LLM extraction pipeline. Removes _ocr_image_vision(), the vision JSON fast path in _parse_receipt_text(), _match_category(), and the vision_ocr_model config setting entirely.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -317,30 +317,6 @@ class ExpensesAgent(BaseAgent):
|
|||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _match_category(category: str, expense_products: list) -> str:
|
|
||||||
"""Map a vision-model category label to the nearest expense product name.
|
|
||||||
|
|
||||||
Tries exact/substring match first, then a fuzzy SequenceMatcher pass.
|
|
||||||
Returns empty string when no reasonable match is found.
|
|
||||||
"""
|
|
||||||
if not expense_products or not category:
|
|
||||||
return ''
|
|
||||||
cat = category.lower().strip()
|
|
||||||
# Exact or substring match
|
|
||||||
for p in expense_products:
|
|
||||||
name = p['name'].lower()
|
|
||||||
if cat == name or cat in name or name in cat:
|
|
||||||
return p['name']
|
|
||||||
# Fuzzy fallback (ratio >= 0.4)
|
|
||||||
names_lower = [p['name'].lower() for p in expense_products]
|
|
||||||
matches = difflib.get_close_matches(cat, names_lower, n=1, cutoff=0.4)
|
|
||||||
if matches:
|
|
||||||
for p in expense_products:
|
|
||||||
if p['name'].lower() == matches[0]:
|
|
||||||
return p['name']
|
|
||||||
return ''
|
|
||||||
|
|
||||||
async def _parse_receipt_text(self, text: str, filename: str,
|
async def _parse_receipt_text(self, text: str, filename: str,
|
||||||
expense_products: list = None,
|
expense_products: list = None,
|
||||||
date_hint: str = None) -> dict:
|
date_hint: str = None) -> dict:
|
||||||
@@ -348,35 +324,7 @@ class ExpensesAgent(BaseAgent):
|
|||||||
fallback = {'vendor': filename, 'amount': 0.0,
|
fallback = {'vendor': filename, 'amount': 0.0,
|
||||||
'date': date_hint or today, 'time': None, 'product_name': ''}
|
'date': date_hint or today, 'time': None, 'product_name': ''}
|
||||||
|
|
||||||
# ── Fast path: vision model already returned structured JSON ──────────
|
|
||||||
# receipt_parser._ocr_image_vision() returns a JSON string directly
|
|
||||||
# when a vision model is configured. Skip the second LLM call entirely.
|
|
||||||
stripped = (text or '').strip()
|
stripped = (text or '').strip()
|
||||||
if stripped.startswith('{'):
|
|
||||||
try:
|
|
||||||
data = json.loads(stripped)
|
|
||||||
if 'amount' in data:
|
|
||||||
logger.debug('expenses_agent: using vision pre-extracted JSON for %s', filename)
|
|
||||||
# Map the vision category label → expense product name
|
|
||||||
product_name = self._match_category(
|
|
||||||
data.get('category', ''), expense_products or [])
|
|
||||||
# Vision model sometimes returns the string "null" instead
|
|
||||||
# of JSON null — normalise both fields.
|
|
||||||
_NULL = (None, 'null', 'None', '')
|
|
||||||
raw_time = data.get('time')
|
|
||||||
time_val = None if raw_time in _NULL else str(raw_time)
|
|
||||||
raw_date = data.get('date')
|
|
||||||
date_val = None if raw_date in _NULL else str(raw_date)
|
|
||||||
return {
|
|
||||||
'vendor': str(data.get('vendor') or filename),
|
|
||||||
'amount': float(data.get('amount', 0.0)),
|
|
||||||
'date': date_val or date_hint or today,
|
|
||||||
'time': time_val,
|
|
||||||
'product_name': product_name,
|
|
||||||
}
|
|
||||||
except (json.JSONDecodeError, ValueError, TypeError):
|
|
||||||
pass # not clean JSON — fall through to LLM path
|
|
||||||
|
|
||||||
ocr_failed = not stripped or stripped.startswith('[')
|
ocr_failed = not stripped or stripped.startswith('[')
|
||||||
|
|
||||||
product_list = ''
|
product_list = ''
|
||||||
|
|||||||
@@ -16,10 +16,6 @@ class Settings(BaseSettings):
|
|||||||
ollama_model: str = 'activeblue-chat'
|
ollama_model: str = 'activeblue-chat'
|
||||||
ollama_timeout: int = 300
|
ollama_timeout: int = 300
|
||||||
ollama_max_concurrent: int = 2
|
ollama_max_concurrent: int = 2
|
||||||
# Set to a vision-capable model (e.g. llama3.2-vision:11b) to use
|
|
||||||
# vision OCR for receipt images instead of Tesseract. Leave empty
|
|
||||||
# to keep the Tesseract pipeline.
|
|
||||||
vision_ocr_model: str = ''
|
|
||||||
|
|
||||||
# Anthropic / Claude
|
# Anthropic / Claude
|
||||||
anthropic_api_key: str = ''
|
anthropic_api_key: str = ''
|
||||||
|
|||||||
@@ -80,121 +80,10 @@ def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:
|
|||||||
|
|
||||||
|
|
||||||
def _ocr_image(data: bytes, filename: str) -> str:
|
def _ocr_image(data: bytes, filename: str) -> str:
|
||||||
"""Extract text from a receipt image.
|
"""Extract text from a receipt image using Tesseract."""
|
||||||
|
|
||||||
Tries vision-model OCR first when VISION_OCR_MODEL is configured,
|
|
||||||
then falls back to the Tesseract pipeline.
|
|
||||||
"""
|
|
||||||
from agent_service.config import get_settings
|
|
||||||
settings = get_settings()
|
|
||||||
if settings.vision_ocr_model:
|
|
||||||
result = _ocr_image_vision(data, filename,
|
|
||||||
settings.ollama_url,
|
|
||||||
settings.vision_ocr_model)
|
|
||||||
if result:
|
|
||||||
return result
|
|
||||||
logger.warning('Vision OCR returned empty for %s — falling back to Tesseract', filename)
|
|
||||||
return _ocr_image_tesseract(data, filename)
|
return _ocr_image_tesseract(data, filename)
|
||||||
|
|
||||||
|
|
||||||
def _ocr_image_vision(data: bytes, filename: str, ollama_url: str, model: str) -> str:
|
|
||||||
"""Use an Ollama vision model to extract receipt data directly as JSON.
|
|
||||||
|
|
||||||
Returns a JSON string {vendor, amount, date, time, category} so the
|
|
||||||
expenses agent can skip the second LLM extraction step entirely.
|
|
||||||
Returns empty string on any failure so the caller falls back to Tesseract.
|
|
||||||
"""
|
|
||||||
import json as _json
|
|
||||||
import re as _re
|
|
||||||
|
|
||||||
def _repair_json(s: str) -> str:
|
|
||||||
"""Fix the most common LLM JSON formatting mistakes.
|
|
||||||
|
|
||||||
Handles:
|
|
||||||
- trailing commas before } or ] → {"a":1,} becomes {"a":1}
|
|
||||||
- single-quoted strings → {'a':'b'} becomes {"a":"b"}
|
|
||||||
- unquoted string keys → {a: "b"} becomes {"a": "b"}
|
|
||||||
"""
|
|
||||||
# trailing commas
|
|
||||||
s = _re.sub(r',\s*([}\]])', r'\1', s)
|
|
||||||
# single-quoted strings (careful around apostrophes in values)
|
|
||||||
s = _re.sub(r"'([^']*)'", r'"\1"', s)
|
|
||||||
# unquoted keys: word characters before a colon
|
|
||||||
s = _re.sub(r'(?<!["\w])(\w+)\s*:', r'"\1":', s)
|
|
||||||
return s
|
|
||||||
|
|
||||||
try:
|
|
||||||
import ollama as _ollama
|
|
||||||
client = _ollama.Client(host=ollama_url)
|
|
||||||
response = client.chat(
|
|
||||||
model=model,
|
|
||||||
format='json', # Ollama JSON mode — forces syntactically valid output
|
|
||||||
messages=[{
|
|
||||||
'role': 'user',
|
|
||||||
'content': (
|
|
||||||
'You are a receipt data extractor. '
|
|
||||||
'Read this receipt image and extract the following fields. '
|
|
||||||
'Copy values EXACTLY as printed — do NOT guess, infer, or '
|
|
||||||
'invent values you cannot clearly see.\n\n'
|
|
||||||
'Fields to extract:\n'
|
|
||||||
'- vendor: the store or restaurant name exactly as printed; '
|
|
||||||
'empty string if not clearly visible\n'
|
|
||||||
'- amount: the FINAL total the customer paid; find a line '
|
|
||||||
'labeled "Total", "Grand Total", "Amount Due", or "Balance Due"; '
|
|
||||||
'copy the number exactly; do NOT use subtotal, tax, or tip; '
|
|
||||||
'return 0 if no clearly labeled final total is visible\n'
|
|
||||||
'- date: transaction date in YYYY-MM-DD format; '
|
|
||||||
'null if not clearly visible\n'
|
|
||||||
'- time: transaction time in HH:MM 24-hour format; '
|
|
||||||
'null if not clearly visible\n'
|
|
||||||
'- category: one of: meals, fuel, hotel, office, transport, other\n\n'
|
|
||||||
'Return ONLY a valid JSON object, no commentary, no markdown:\n'
|
|
||||||
'{"vendor":"...","amount":0.00,"date":"YYYY-MM-DD or null",'
|
|
||||||
'"time":"HH:MM or null","category":"..."}'
|
|
||||||
),
|
|
||||||
'images': [data],
|
|
||||||
}],
|
|
||||||
)
|
|
||||||
if isinstance(response, dict):
|
|
||||||
raw = (response.get('message', {}).get('content') or '').strip()
|
|
||||||
else:
|
|
||||||
raw = (response.message.content or '').strip()
|
|
||||||
|
|
||||||
# Must contain a JSON object, not prose
|
|
||||||
first, last = raw.find('{'), raw.rfind('}')
|
|
||||||
if first == -1 or last <= first:
|
|
||||||
logger.warning('Vision OCR %s: model returned prose, falling back to Tesseract',
|
|
||||||
filename)
|
|
||||||
return ''
|
|
||||||
json_str = raw[first:last + 1]
|
|
||||||
|
|
||||||
# Parse — on failure attempt common repairs then retry once
|
|
||||||
try:
|
|
||||||
parsed = _json.loads(json_str)
|
|
||||||
except _json.JSONDecodeError as json_err:
|
|
||||||
repaired = _repair_json(json_str)
|
|
||||||
try:
|
|
||||||
parsed = _json.loads(repaired)
|
|
||||||
logger.debug('Vision OCR %s: JSON repaired successfully', filename)
|
|
||||||
except _json.JSONDecodeError:
|
|
||||||
logger.warning('Vision OCR %s: JSON parse failed (%s), falling back',
|
|
||||||
filename, json_err)
|
|
||||||
return ''
|
|
||||||
|
|
||||||
if 'amount' not in parsed:
|
|
||||||
logger.warning('Vision OCR %s: JSON missing amount field, falling back', filename)
|
|
||||||
return ''
|
|
||||||
logger.debug('Vision OCR %s (%s): extracted JSON ok', filename, model)
|
|
||||||
# Re-serialise so downstream always gets clean, canonical JSON
|
|
||||||
return _json.dumps(parsed)
|
|
||||||
except ImportError:
|
|
||||||
logger.warning('ollama package not installed — vision OCR unavailable for %s', filename)
|
|
||||||
return ''
|
|
||||||
except Exception as exc:
|
|
||||||
logger.warning('Vision OCR failed for %s: %s', filename, exc)
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def _ocr_image_tesseract(data: bytes, filename: str) -> str:
|
def _ocr_image_tesseract(data: bytes, filename: str) -> str:
|
||||||
"""Tesseract-based OCR pipeline (fallback)."""
|
"""Tesseract-based OCR pipeline (fallback)."""
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -423,88 +423,12 @@ async def test_act_no_employee_returns_empty_and_escalates():
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# _match_category
|
# _parse_receipt_text — LLM extraction path
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestMatchCategory:
|
|
||||||
PRODUCTS = [
|
|
||||||
{'id': 1, 'name': 'Meals'},
|
|
||||||
{'id': 2, 'name': 'Fuel'},
|
|
||||||
{'id': 3, 'name': 'Hotel'},
|
|
||||||
{'id': 4, 'name': 'Office Supplies'},
|
|
||||||
{'id': 5, 'name': 'Transport'},
|
|
||||||
{'id': 6, 'name': 'Other'},
|
|
||||||
]
|
|
||||||
|
|
||||||
def test_exact_match(self):
|
|
||||||
assert ExpensesAgent._match_category('Meals', self.PRODUCTS) == 'Meals'
|
|
||||||
|
|
||||||
def test_case_insensitive(self):
|
|
||||||
assert ExpensesAgent._match_category('meals', self.PRODUCTS) == 'Meals'
|
|
||||||
assert ExpensesAgent._match_category('FUEL', self.PRODUCTS) == 'Fuel'
|
|
||||||
|
|
||||||
def test_substring_match(self):
|
|
||||||
# 'office' is a substring of 'Office Supplies'
|
|
||||||
assert ExpensesAgent._match_category('office', self.PRODUCTS) == 'Office Supplies'
|
|
||||||
|
|
||||||
def test_fuzzy_match(self):
|
|
||||||
# 'transport' is close to 'Transport'
|
|
||||||
assert ExpensesAgent._match_category('transport', self.PRODUCTS) == 'Transport'
|
|
||||||
|
|
||||||
def test_no_match_returns_empty(self):
|
|
||||||
assert ExpensesAgent._match_category('zxqwerty', self.PRODUCTS) == ''
|
|
||||||
|
|
||||||
def test_empty_category(self):
|
|
||||||
assert ExpensesAgent._match_category('', self.PRODUCTS) == ''
|
|
||||||
|
|
||||||
def test_empty_products(self):
|
|
||||||
assert ExpensesAgent._match_category('meals', []) == ''
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# _parse_receipt_text — vision JSON fast path
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_parse_vision_json_fast_path():
|
async def test_parse_plain_ocr_text_uses_llm():
|
||||||
"""When text is pre-extracted JSON from vision model, skip LLM call."""
|
"""Plain OCR text should go through the LLM extraction path."""
|
||||||
agent = _make_agent()
|
|
||||||
agent._llm.submit = AsyncMock() # should NOT be called
|
|
||||||
|
|
||||||
vision_json = ('{"vendor":"McDonald\'s","amount":12.50,'
|
|
||||||
'"date":"2026-05-09","time":"13:30","category":"meals"}')
|
|
||||||
products = [{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Fuel'}]
|
|
||||||
|
|
||||||
result = await agent._parse_receipt_text(vision_json, 'receipt.jpg',
|
|
||||||
expense_products=products)
|
|
||||||
|
|
||||||
assert result['vendor'] == "McDonald's"
|
|
||||||
assert result['amount'] == 12.50
|
|
||||||
assert result['date'] == '2026-05-09'
|
|
||||||
assert result['time'] == '13:30'
|
|
||||||
assert result['product_name'] == 'Meals'
|
|
||||||
agent._llm.submit.assert_not_called()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_parse_vision_json_null_time():
|
|
||||||
"""Vision model may return the string 'null' for time — normalise to None."""
|
|
||||||
agent = _make_agent()
|
|
||||||
agent._llm.submit = AsyncMock()
|
|
||||||
|
|
||||||
vision_json = '{"vendor":"Shell","amount":45.00,"date":"2026-05-09","time":"null","category":"fuel"}'
|
|
||||||
products = [{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Fuel'}]
|
|
||||||
|
|
||||||
result = await agent._parse_receipt_text(vision_json, 'shell.jpg',
|
|
||||||
expense_products=products)
|
|
||||||
assert result['time'] is None
|
|
||||||
assert result['product_name'] == 'Fuel'
|
|
||||||
agent._llm.submit.assert_not_called()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_parse_non_json_text_falls_through_to_llm():
|
|
||||||
"""Plain OCR text (not JSON) should go through the LLM extraction path."""
|
|
||||||
agent = _make_agent()
|
agent = _make_agent()
|
||||||
llm_resp = MagicMock()
|
llm_resp = MagicMock()
|
||||||
llm_resp.content = '{"vendor":"Acme","amount":9.99,"date":"2026-05-09","time":null,"product_name":"Meals"}'
|
llm_resp.content = '{"vendor":"Acme","amount":9.99,"date":"2026-05-09","time":null,"product_name":"Meals"}'
|
||||||
|
|||||||
Reference in New Issue
Block a user