Add OCR debug logging to diagnose receipt extraction quality

Logs, for each receipt, the OCR text length, the first 120 characters of
the OCR output, and the final parsed vendor/amount/date/product_name.
This will show whether Tesseract is producing usable text.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-20 23:23:13 -04:00
parent e6c3d08990
commit f1a8add84b

View File

@@ -195,6 +195,8 @@ class ExpensesAgent(BaseAgent):
expense_products = await self._et.get_expense_products()
default_product_id = expense_products[0]['id'] if expense_products else None
product_map = {p['id']: p['name'] for p in expense_products}
logger.info('expenses_agent: %d receipts received, %d expense products available',
len(receipts), len(expense_products))
# Pass 1: byte-exact dedup
seen_hashes: set = set()
@@ -208,6 +210,14 @@ class ExpensesAgent(BaseAgent):
seen_hashes.add(h)
unique_receipts.append(r)
# Log OCR quality for each receipt so we can diagnose extraction failures
for r in unique_receipts:
raw_text = r.get('text', '') or ''
ocr_len = len(raw_text)
ocr_preview = raw_text[:120].replace('\n', '')
logger.info('ocr filename=%r date_hint=%r ocr_len=%d text_preview=%r',
r.get('filename'), r.get('date_from_name'), ocr_len, ocr_preview)
# Parse all receipts concurrently
parse_tasks = [
self._parse_receipt_text(
@@ -227,6 +237,9 @@ class ExpensesAgent(BaseAgent):
parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
'date': receipt.get('date_from_name') or _date.today().isoformat(),
'time': None, 'product_name': ''}
logger.info('parsed filename=%r → vendor=%r amount=%s date=%r product=%r',
receipt.get('filename'), parsed.get('vendor'),
parsed.get('amount'), parsed.get('date'), parsed.get('product_name'))
paired.append((receipt, parsed))
# Pass 2: semantic dedup