fix: vision OCR receipt extraction — skip second LLM call, fix total truncation
receipt_parser: change _ocr_image_vision() to extract structured JSON
{vendor,amount,date,time,category} directly from the image instead of
transcribing raw text, so the downstream LLM extraction step is
unnecessary and the two-step error-compounding is eliminated.
expenses_agent: add _match_category() helper to map vision category
labels to expense product names via substring/fuzzy match; add fast
path in _parse_receipt_text() that detects pre-extracted vision JSON
(text starts with '{') and skips the second LLM submit call entirely.
Fix text[:2000] truncation that discarded receipt totals — receipts longer
than 3000 chars now keep the first 1500 + last 1500 chars, so the grand
total at the bottom is included in the prompt.
tests: fix stale test_act_enters_awaiting_confirmation_on_first_pass
(confirmation gate was removed); add TestMatchCategory and three new
tests for the vision JSON fast path and LLM fallthrough.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -317,13 +317,63 @@ class ExpensesAgent(BaseAgent):
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _match_category(category: str, expense_products: list) -> str:
|
||||
"""Map a vision-model category label to the nearest expense product name.
|
||||
|
||||
Tries exact/substring match first, then a fuzzy SequenceMatcher pass.
|
||||
Returns empty string when no reasonable match is found.
|
||||
"""
|
||||
if not expense_products or not category:
|
||||
return ''
|
||||
cat = category.lower().strip()
|
||||
# Exact or substring match
|
||||
for p in expense_products:
|
||||
name = p['name'].lower()
|
||||
if cat == name or cat in name or name in cat:
|
||||
return p['name']
|
||||
# Fuzzy fallback (ratio >= 0.4)
|
||||
names_lower = [p['name'].lower() for p in expense_products]
|
||||
matches = difflib.get_close_matches(cat, names_lower, n=1, cutoff=0.4)
|
||||
if matches:
|
||||
for p in expense_products:
|
||||
if p['name'].lower() == matches[0]:
|
||||
return p['name']
|
||||
return ''
|
||||
|
||||
async def _parse_receipt_text(self, text: str, filename: str,
|
||||
expense_products: list = None,
|
||||
date_hint: str = None) -> dict:
|
||||
today = _date.today().isoformat()
|
||||
fallback = {'vendor': filename, 'amount': 0.0,
|
||||
'date': date_hint or today, 'time': None, 'product_name': ''}
|
||||
ocr_failed = not text or text.startswith('[')
|
||||
|
||||
# ── Fast path: vision model already returned structured JSON ──────────
|
||||
# receipt_parser._ocr_image_vision() returns a JSON string directly
|
||||
# when a vision model is configured. Skip the second LLM call entirely.
|
||||
stripped = (text or '').strip()
|
||||
if stripped.startswith('{'):
|
||||
try:
|
||||
data = json.loads(stripped)
|
||||
if 'amount' in data:
|
||||
logger.debug('expenses_agent: using vision pre-extracted JSON for %s', filename)
|
||||
# Map the vision category label → expense product name
|
||||
product_name = self._match_category(
|
||||
data.get('category', ''), expense_products or [])
|
||||
# Vision model sometimes returns the string "null" instead of JSON null
|
||||
raw_time = data.get('time')
|
||||
time_val = None if raw_time in (None, 'null', 'None', '') else str(raw_time)
|
||||
return {
|
||||
'vendor': str(data.get('vendor') or filename),
|
||||
'amount': float(data.get('amount', 0.0)),
|
||||
'date': str(data.get('date') or date_hint or today),
|
||||
'time': time_val,
|
||||
'product_name': product_name,
|
||||
}
|
||||
except (json.JSONDecodeError, ValueError, TypeError):
|
||||
pass # not clean JSON — fall through to LLM path
|
||||
|
||||
ocr_failed = not stripped or stripped.startswith('[')
|
||||
|
||||
product_list = ''
|
||||
if expense_products:
|
||||
@@ -341,6 +391,13 @@ class ExpensesAgent(BaseAgent):
|
||||
f'Return ONLY valid JSON: {{"product_name": "..."}}'
|
||||
)
|
||||
else:
|
||||
# Keep both the header (vendor/date) and footer (totals) of the receipt.
|
||||
# A plain [:N] cut discards the bottom of long receipts where the grand
|
||||
# total lives — the primary cause of amount=0 extraction errors.
|
||||
if len(stripped) > 3000:
|
||||
receipt_text = stripped[:1500] + '\n[...]\n' + stripped[-1500:]
|
||||
else:
|
||||
receipt_text = stripped
|
||||
prompt = (
|
||||
'Extract expense details from the following receipt text. '
|
||||
'Return ONLY valid JSON with these keys:\n'
|
||||
@@ -354,7 +411,7 @@ class ExpensesAgent(BaseAgent):
|
||||
'"time" (string HH:MM in 24-hour format — the transaction time printed on the receipt; '
|
||||
'null if not present),\n'
|
||||
f'"product_name" (string, pick the best match from [{product_list}] or empty string).\n\n'
|
||||
f'Receipt text:\n{text[:2000]}\n\nJSON only:'
|
||||
f'Receipt text:\n{receipt_text}\n\nJSON only:'
|
||||
)
|
||||
try:
|
||||
resp = await self._llm.submit(
|
||||
|
||||
Reference in New Issue
Block a user