diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py index fcb5d37..a00222b 100644 --- a/agent_service/agents/expenses_agent.py +++ b/agent_service/agents/expenses_agent.py @@ -32,6 +32,21 @@ _SKIP_LINE_RE = re.compile( # Any standalone dollar-like amount (optional $, up to 6 digits, 2 decimals) _ANY_DOLLAR_RE = re.compile(r'(? bool: + """Return True when the OCR text has too many amount-bearing lines to be a receipt. + + Single receipts: typically 1-9 lines with dollar values. + Bank/card statements: 10-50+ lines (one per transaction). + """ + count = sum(1 for line in text.splitlines() if _ANY_DOLLAR_RE.search(line)) + return count >= _STMT_AMOUNT_LINE_THRESHOLD + + _DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD _DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY _DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY @@ -237,7 +252,14 @@ class ExpensesAgent(BaseAgent): return [] expense_products = await self._et.get_expense_products() - default_product_id = expense_products[0]['id'] if expense_products else None + # Prefer "Meals" as the fallback category — most receipts are food. + # Avoid blindly defaulting to whatever Odoo returns first (often "Communication"). + _meals = next((p for p in expense_products + if p['name'].lower() == 'meals'), None) + default_product_id = ( + _meals['id'] if _meals + else (expense_products[0]['id'] if expense_products else None) + ) product_map = {p['id']: p['name'] for p in expense_products} logger.info('expenses_agent: %d receipts received, %d expense products available', len(receipts), len(expense_products)) @@ -281,6 +303,14 @@ class ExpensesAgent(BaseAgent): parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0, 'date': receipt.get('date_from_name') or _date.today().isoformat(), 'time': None, 'product_name': ''} + if parsed.get('skip'): + logger.info('expenses_agent: skipping bank/card statement: %s', + receipt.get('filename')) + self._escalations_list.append( + f"Skipped \"{receipt.get('filename')}\": " + 'looks like a bank or card statement, not a single receipt.' + ) + continue logger.info('parsed filename=%r → vendor=%r amount=%s date=%r product=%r', receipt.get('filename'), parsed.get('vendor'), parsed.get('amount'), parsed.get('date'), parsed.get('product_name')) @@ -447,6 +477,20 @@ class ExpensesAgent(BaseAgent): stripped = (text or '').strip() ocr_failed = not stripped or stripped.startswith('[') + # ── Bank / card statement detection ────────────────────────────────── + # A statement screenshot has many amount-bearing lines; running the + # max-scan on it returns a random large transaction, not a total. + # Skip these files so they don't produce a wildly wrong expense. + if not ocr_failed and _is_likely_bank_statement(stripped): + n = sum(1 for l in stripped.splitlines() if _ANY_DOLLAR_RE.search(l)) + logger.warning( + 'receipt %s: looks like a bank/card statement (%d amount lines) — skip', + filename, n, + ) + return {'vendor': filename, 'amount': 0.0, + 'date': date_hint or today, 'time': None, + 'product_name': '', 'skip': True} + # ── Amount: regex (deterministic) ──────────────────────────────────── amount = _extract_amount_from_text(stripped) if not ocr_failed else 0.0 @@ -475,10 +519,13 @@ class ExpensesAgent(BaseAgent): 'If this looks like a bank or credit-card statement listing ' 'multiple transactions rather than a single merchant receipt, ' 'use "". Use "" if no clear business name is visible.\n' - f'"product_name": the single best match from [{product_list}] ' - 'based on the type of business (restaurant→Meals, gas station→Fuel, ' - 'hotel→Hotel, airline/transit→Transport, office store→Office Supplies). ' - 'Use "" if none fit.\n\n' + f'"product_name": pick the single best match from [{product_list}]. ' + 'Guide: restaurant / cafe / fast food → food/meal product; ' + 'airline / airport / transit / taxi / parking / rental car → travel product; ' + 'gas station / petrol / fuel → fuel product; ' + 'hotel / motel / lodging → accommodation product; ' + 'office / tech / hardware store → supplies product. ' + 'Return "" if nothing fits.\n\n' f'Receipt text:\n{excerpt}\n\nJSON only:' ) elif product_list: @@ -520,8 +567,10 @@ class ExpensesAgent(BaseAgent): lines = '\n'.join(f' • {a}' for a in self._actions_taken) n_skipped = data.get('n_skipped', 0) dup_note = f'\n({n_skipped} duplicate receipt(s) were automatically skipped.)' if n_skipped else '' + stmt_skips = [e for e in self._escalations_list if 'statement' in e.lower()] + stmt_note = ('\n⚠ ' + '\n⚠ '.join(stmt_skips)) if stmt_skips else '' summary = ( - f'Expense report created successfully:\n{lines}{dup_note}\n\n' + f'Expense report created successfully:\n{lines}{dup_note}{stmt_note}\n\n' 'The report is in draft — open Odoo › Expenses, ' 'review the amounts, and click Submit to send for approval.' ) diff --git a/tests/test_expenses_agent.py b/tests/test_expenses_agent.py index b3af86e..5a8d388 100644 --- a/tests/test_expenses_agent.py +++ b/tests/test_expenses_agent.py @@ -427,7 +427,7 @@ async def test_act_no_employee_returns_empty_and_escalates(): # --------------------------------------------------------------------------- from agent_service.agents.expenses_agent import ( - _extract_amount_from_text, _extract_date_from_text, + _extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement, ) @@ -502,6 +502,48 @@ class TestExtractAmount: assert _extract_amount_from_text(text) == 8.49 +class TestBankStatementDetection: + def _stmt(self, n: int) -> str: + """Generate fake bank statement with n transaction lines.""" + lines = [f'05/{i+1:02d} MERCHANT {i} $1{i}.99' for i in range(n)] + return '\n'.join(lines) + + def test_receipt_not_flagged(self): + # A typical restaurant receipt has < 10 amount-bearing lines + text = 'Acme Cafe\nBurger 12.99\nFries 4.50\nDrink 2.99\nTax 1.65\nTotal 22.13' + assert _is_likely_bank_statement(text) is False + + def test_statement_flagged(self): + # 10 transaction lines → flagged as statement + assert _is_likely_bank_statement(self._stmt(10)) is True + + def test_threshold_boundary(self): + assert _is_likely_bank_statement(self._stmt(9)) is False + assert _is_likely_bank_statement(self._stmt(10)) is True + + def test_empty_text(self): + assert _is_likely_bank_statement('') is False + + def test_no_amounts(self): + assert _is_likely_bank_statement('Hello world\nNo prices here') is False + + +@pytest.mark.asyncio +async def test_parse_bank_statement_returns_skip(): + """Bank statement image must be skipped — no amount, skip=True returned.""" + agent = _make_agent() + # Build fake OCR text with 12 transaction lines + stmt_text = '\n'.join( + f'05/{i+1:02d} SOME MERCHANT {i} ${10 + i}.99' for i in range(12) + ) + result = await agent._parse_receipt_text( + stmt_text, '2026-05-15_bank.png', + expense_products=[{'id': 1, 'name': 'Meals'}], + ) + assert result.get('skip') is True + assert result['amount'] == 0.0 + + class TestExtractDate: def test_iso_format(self): assert _extract_date_from_text('Date: 2026-05-09') == '2026-05-09'