fix: vision OCR receipt extraction — skip second LLM call, fix total truncation

receipt_parser: change _ocr_image_vision() to extract structured JSON
{vendor,amount,date,time,category} directly from the image instead of
transcribing raw text, so the downstream LLM extraction step is
unnecessary and the two-step error-compounding is eliminated.

expenses_agent: add _match_category() helper to map vision category
labels to expense product names via substring/fuzzy match; add fast
path in _parse_receipt_text() that detects pre-extracted vision JSON
(text starts with '{') and skips the second LLM submit call entirely.
Fix text[:2000] truncation that discarded receipt totals — receipts
longer than 3000 chars now keep their first 1500 + last 1500 chars
(joined by a '[...]' marker) so the grand total at the bottom is
included.

tests: fix stale test_act_enters_awaiting_confirmation_on_first_pass
(confirmation gate was removed); add TestMatchCategory and three new
tests for the vision JSON fast path and LLM fallthrough.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-20 21:49:31 -04:00
parent 7a0aad3f37
commit 11cc261923
3 changed files with 209 additions and 23 deletions

View File

@@ -317,13 +317,63 @@ class ExpensesAgent(BaseAgent):
return None
@staticmethod
def _match_category(category: str, expense_products: list) -> str:
    """Map a vision-model category label to the nearest expense product name.

    Matching is case-insensitive and ignores surrounding whitespace on both
    the label and the product names. A substring match in either direction
    (which also covers exact equality) wins first, in list order. If none is
    found, ``difflib.get_close_matches`` picks the closest name with a
    similarity ratio of at least 0.4.

    Args:
        category: Label produced by the vision model, e.g. ``"meals"``.
            Anything that is not a non-blank string yields no match.
        expense_products: Sequence of mappings carrying a ``'name'`` entry.
            Entries whose name is blank or not a string are skipped.

    Returns:
        The matching product's name exactly as stored in
        ``expense_products``, or an empty string when nothing matches.
    """
    # The label comes from model-generated JSON, so it may be null, a number
    # or a list; treat anything that is not real text as "no category".
    if not expense_products or not isinstance(category, str):
        return ''
    cat = category.lower().strip()
    if not cat:
        # '' is a substring of every string; without this guard a blank label
        # would "match" the first product.
        return ''

    # (stored name, normalised name) pairs. Blank names are dropped because
    # '' is a substring of every label and would short-circuit the search.
    named = []
    for product in expense_products:
        name = product['name']
        if isinstance(name, str) and name.strip():
            named.append((name, name.lower().strip()))

    # Substring match in either direction; first product in list order wins.
    for original, lowered in named:
        if cat in lowered or lowered in cat:
            return original

    # Fuzzy fallback (ratio >= 0.4). Names that normalise to the same text
    # resolve to the first product carrying them.
    by_lowered = {}
    for original, lowered in named:
        by_lowered.setdefault(lowered, original)
    closest = difflib.get_close_matches(cat, list(by_lowered), n=1, cutoff=0.4)
    return by_lowered[closest[0]] if closest else ''
async def _parse_receipt_text(self, text: str, filename: str,
expense_products: list = None,
date_hint: str = None) -> dict:
today = _date.today().isoformat()
fallback = {'vendor': filename, 'amount': 0.0,
'date': date_hint or today, 'time': None, 'product_name': ''}
ocr_failed = not text or text.startswith('[')
# ── Fast path: vision model already returned structured JSON ──────────
# receipt_parser._ocr_image_vision() returns a JSON string directly
# when a vision model is configured. Skip the second LLM call entirely.
stripped = (text or '').strip()
if stripped.startswith('{'):
try:
data = json.loads(stripped)
if 'amount' in data:
logger.debug('expenses_agent: using vision pre-extracted JSON for %s', filename)
# Map the vision category label → expense product name
product_name = self._match_category(
data.get('category', ''), expense_products or [])
# Vision model sometimes returns the string "null" instead of JSON null
raw_time = data.get('time')
time_val = None if raw_time in (None, 'null', 'None', '') else str(raw_time)
return {
'vendor': str(data.get('vendor') or filename),
'amount': float(data.get('amount', 0.0)),
'date': str(data.get('date') or date_hint or today),
'time': time_val,
'product_name': product_name,
}
except (json.JSONDecodeError, ValueError, TypeError):
pass # not clean JSON — fall through to LLM path
ocr_failed = not stripped or stripped.startswith('[')
product_list = ''
if expense_products:
@@ -341,6 +391,13 @@ class ExpensesAgent(BaseAgent):
f'Return ONLY valid JSON: {{"product_name": "..."}}'
)
else:
# Keep both the header (vendor/date) and footer (totals) of the receipt.
# A plain [:N] cut discards the bottom of long receipts where the grand
# total lives — the primary cause of amount=0 extraction errors.
if len(stripped) > 3000:
receipt_text = stripped[:1500] + '\n[...]\n' + stripped[-1500:]
else:
receipt_text = stripped
prompt = (
'Extract expense details from the following receipt text. '
'Return ONLY valid JSON with these keys:\n'
@@ -354,7 +411,7 @@ class ExpensesAgent(BaseAgent):
'"time" (string HH:MM in 24-hour format — the transaction time printed on the receipt; '
'null if not present),\n'
f'"product_name" (string, pick the best match from [{product_list}] or empty string).\n\n'
f'Receipt text:\n{text[:2000]}\n\nJSON only:'
f'Receipt text:\n{receipt_text}\n\nJSON only:'
)
try:
resp = await self._llm.submit(

View File

@@ -98,7 +98,13 @@ def _ocr_image(data: bytes, filename: str) -> str:
def _ocr_image_vision(data: bytes, filename: str, ollama_url: str, model: str) -> str:
"""Use an Ollama vision model to read a receipt image."""
"""Use an Ollama vision model to extract receipt data directly as JSON.
Returns a JSON string {vendor, amount, date, time, category} so the
expenses agent can skip the second LLM extraction step entirely.
Returns empty string on any failure so the caller falls back to Tesseract.
"""
import json as _json
try:
import ollama as _ollama
client = _ollama.Client(host=ollama_url)
@@ -107,22 +113,41 @@ def _ocr_image_vision(data: bytes, filename: str, ollama_url: str, model: str) -
messages=[{
'role': 'user',
'content': (
'This is a photo of a paper receipt. '
'Transcribe ALL text exactly as it appears on the receipt. '
'Preserve every line in order: store name, address, date, time, '
'each line item with price, subtotal, tax, tip if present, and '
'the final total. Output the raw text only — no commentary, '
'no markdown, no explanations.'
'This is a photo of a receipt. Extract these fields:\n'
'- vendor: the store or restaurant name\n'
'- amount: the FINAL total the customer paid. Look for a line '
'labeled "Total", "Grand Total", "Amount Due", or "Balance Due". '
'Do NOT use subtotal, tax, or tip. Return 0 if you cannot find '
'a clear final total.\n'
'- date: transaction date in YYYY-MM-DD format\n'
'- time: transaction time in HH:MM 24-hour format, or null\n'
'- category: one word describing the expense type — one of: '
'meals, fuel, hotel, office, transport, other\n\n'
'Return ONLY a valid JSON object, no commentary, no markdown:\n'
'{"vendor":"...","amount":0.00,"date":"YYYY-MM-DD",'
'"time":"HH:MM or null","category":"..."}'
),
'images': [data],
}],
)
if isinstance(response, dict):
text = (response.get('message', {}).get('content') or '').strip()
raw = (response.get('message', {}).get('content') or '').strip()
else:
text = (response.message.content or '').strip()
logger.debug('Vision OCR %s (%s): %d chars', filename, model, len(text))
return text
raw = (response.message.content or '').strip()
# Must contain a JSON object, not prose
first, last = raw.find('{'), raw.rfind('}')
if first == -1 or last <= first:
logger.warning('Vision OCR %s: model returned prose, falling back to Tesseract',
filename)
return ''
json_str = raw[first:last + 1]
parsed = _json.loads(json_str)
if 'amount' not in parsed:
logger.warning('Vision OCR %s: JSON missing amount field, falling back', filename)
return ''
logger.debug('Vision OCR %s (%s): extracted JSON ok', filename, model)
return json_str
except ImportError:
logger.warning('ollama package not installed — vision OCR unavailable for %s', filename)
return ''