diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py index f6be67e..cc9c4cb 100644 --- a/agent_service/agents/expenses_agent.py +++ b/agent_service/agents/expenses_agent.py @@ -195,6 +195,8 @@ class ExpensesAgent(BaseAgent): expense_products = await self._et.get_expense_products() default_product_id = expense_products[0]['id'] if expense_products else None product_map = {p['id']: p['name'] for p in expense_products} + logger.info('expenses_agent: %d receipts received, %d expense products available', + len(receipts), len(expense_products)) # Pass 1: byte-exact dedup seen_hashes: set = set() @@ -208,6 +210,14 @@ class ExpensesAgent(BaseAgent): seen_hashes.add(h) unique_receipts.append(r) + # Log OCR quality for each receipt so we can diagnose extraction failures + for r in unique_receipts: + raw_text = r.get('text', '') or '' + ocr_len = len(raw_text) + ocr_preview = raw_text[:120].replace('\n', '↵') + logger.info('ocr filename=%r date_hint=%r ocr_len=%d text_preview=%r', + r.get('filename'), r.get('date_from_name'), ocr_len, ocr_preview) + # Parse all receipts concurrently parse_tasks = [ self._parse_receipt_text( @@ -227,6 +237,9 @@ class ExpensesAgent(BaseAgent): parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0, 'date': receipt.get('date_from_name') or _date.today().isoformat(), 'time': None, 'product_name': ''} + logger.info('parsed filename=%r → vendor=%r amount=%s date=%r product=%r', + receipt.get('filename'), parsed.get('vendor'), + parsed.get('amount'), parsed.get('date'), parsed.get('product_name')) paired.append((receipt, parsed)) # Pass 2: semantic dedup