Add OCR debug logging to diagnose receipt extraction quality
For each receipt, logs the OCR text length, the first 120 characters of the OCR output, and the final parsed vendor/amount/date/product_name. This shows whether Tesseract is producing usable text.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -195,6 +195,8 @@ class ExpensesAgent(BaseAgent):
|
||||
expense_products = await self._et.get_expense_products()
|
||||
default_product_id = expense_products[0]['id'] if expense_products else None
|
||||
product_map = {p['id']: p['name'] for p in expense_products}
|
||||
logger.info('expenses_agent: %d receipts received, %d expense products available',
|
||||
len(receipts), len(expense_products))
|
||||
|
||||
# Pass 1: byte-exact dedup
|
||||
seen_hashes: set = set()
|
||||
@@ -208,6 +210,14 @@ class ExpensesAgent(BaseAgent):
|
||||
seen_hashes.add(h)
|
||||
unique_receipts.append(r)
|
||||
|
||||
# Log OCR quality for each receipt so we can diagnose extraction failures
|
||||
for r in unique_receipts:
|
||||
raw_text = r.get('text', '') or ''
|
||||
ocr_len = len(raw_text)
|
||||
ocr_preview = raw_text[:120].replace('\n', '↵')
|
||||
logger.info('ocr filename=%r date_hint=%r ocr_len=%d text_preview=%r',
|
||||
r.get('filename'), r.get('date_from_name'), ocr_len, ocr_preview)
|
||||
|
||||
# Parse all receipts concurrently
|
||||
parse_tasks = [
|
||||
self._parse_receipt_text(
|
||||
@@ -227,6 +237,9 @@ class ExpensesAgent(BaseAgent):
|
||||
parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
|
||||
'date': receipt.get('date_from_name') or _date.today().isoformat(),
|
||||
'time': None, 'product_name': ''}
|
||||
logger.info('parsed filename=%r → vendor=%r amount=%s date=%r product=%r',
|
||||
receipt.get('filename'), parsed.get('vendor'),
|
||||
parsed.get('amount'), parsed.get('date'), parsed.get('product_name'))
|
||||
paired.append((receipt, parsed))
|
||||
|
||||
# Pass 2: semantic dedup
|
||||
|
||||
Reference in New Issue
Block a user