fix: vision OCR receipt extraction — skip second LLM call, fix total truncation
receipt_parser: change _ocr_image_vision() to extract structured JSON
{vendor, amount, date, time, category} directly from the image instead of
transcribing raw text, so the downstream LLM extraction step is
unnecessary and errors no longer compound across two steps.
expenses_agent: add _match_category() helper to map vision category
labels to expense product names via substring/fuzzy match; add fast
path in _parse_receipt_text() that detects pre-extracted vision JSON
(text starts with '{') and skips the second LLM submit call entirely.
Fix text[:2000] truncation that discarded receipt totals — now keeps the
first 1500 and last 1500 chars of long receipts, so the grand total at
the bottom is always included.
tests: fix stale test_act_enters_awaiting_confirmation_on_first_pass
(confirmation gate was removed); add TestMatchCategory and three new
tests for the vision JSON fast path and LLM fallthrough.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -98,7 +98,13 @@ def _ocr_image(data: bytes, filename: str) -> str:
|
||||
|
||||
|
||||
def _ocr_image_vision(data: bytes, filename: str, ollama_url: str, model: str) -> str:
|
||||
"""Use an Ollama vision model to read a receipt image."""
|
||||
"""Use an Ollama vision model to extract receipt data directly as JSON.
|
||||
|
||||
Returns a JSON string {vendor, amount, date, time, category} so the
|
||||
expenses agent can skip the second LLM extraction step entirely.
|
||||
Returns empty string on any failure so the caller falls back to Tesseract.
|
||||
"""
|
||||
import json as _json
|
||||
try:
|
||||
import ollama as _ollama
|
||||
client = _ollama.Client(host=ollama_url)
|
||||
@@ -107,22 +113,41 @@ def _ocr_image_vision(data: bytes, filename: str, ollama_url: str, model: str) -
|
||||
messages=[{
|
||||
'role': 'user',
|
||||
'content': (
|
||||
'This is a photo of a paper receipt. '
|
||||
'Transcribe ALL text exactly as it appears on the receipt. '
|
||||
'Preserve every line in order: store name, address, date, time, '
|
||||
'each line item with price, subtotal, tax, tip if present, and '
|
||||
'the final total. Output the raw text only — no commentary, '
|
||||
'no markdown, no explanations.'
|
||||
'This is a photo of a receipt. Extract these fields:\n'
|
||||
'- vendor: the store or restaurant name\n'
|
||||
'- amount: the FINAL total the customer paid. Look for a line '
|
||||
'labeled "Total", "Grand Total", "Amount Due", or "Balance Due". '
|
||||
'Do NOT use subtotal, tax, or tip. Return 0 if you cannot find '
|
||||
'a clear final total.\n'
|
||||
'- date: transaction date in YYYY-MM-DD format\n'
|
||||
'- time: transaction time in HH:MM 24-hour format, or null\n'
|
||||
'- category: one word describing the expense type — one of: '
|
||||
'meals, fuel, hotel, office, transport, other\n\n'
|
||||
'Return ONLY a valid JSON object, no commentary, no markdown:\n'
|
||||
'{"vendor":"...","amount":0.00,"date":"YYYY-MM-DD",'
|
||||
'"time":"HH:MM or null","category":"..."}'
|
||||
),
|
||||
'images': [data],
|
||||
}],
|
||||
)
|
||||
if isinstance(response, dict):
|
||||
text = (response.get('message', {}).get('content') or '').strip()
|
||||
raw = (response.get('message', {}).get('content') or '').strip()
|
||||
else:
|
||||
text = (response.message.content or '').strip()
|
||||
logger.debug('Vision OCR %s (%s): %d chars', filename, model, len(text))
|
||||
return text
|
||||
raw = (response.message.content or '').strip()
|
||||
|
||||
# Must contain a JSON object, not prose
|
||||
first, last = raw.find('{'), raw.rfind('}')
|
||||
if first == -1 or last <= first:
|
||||
logger.warning('Vision OCR %s: model returned prose, falling back to Tesseract',
|
||||
filename)
|
||||
return ''
|
||||
json_str = raw[first:last + 1]
|
||||
parsed = _json.loads(json_str)
|
||||
if 'amount' not in parsed:
|
||||
logger.warning('Vision OCR %s: JSON missing amount field, falling back', filename)
|
||||
return ''
|
||||
logger.debug('Vision OCR %s (%s): extracted JSON ok', filename, model)
|
||||
return json_str
|
||||
except ImportError:
|
||||
logger.warning('ollama package not installed — vision OCR unavailable for %s', filename)
|
||||
return ''
|
||||
|
||||
Reference in New Issue
Block a user