fix(expenses): detect bank statements, fix default category, improve prompts

- Add _is_likely_bank_statement(): if the OCR text has ≥10 lines with dollar amounts, it is almost certainly a bank/card statement screenshot, not a single receipt. The receipt parser then returns skip=True, so _act() skips the file and adds a note to the escalations list instead of creating a $1,699 expense line.
- Fix default product selection in _act(): prefer "Meals" over whatever happens to be first in Odoo's expense product list ("Communication"), so unrecognised receipts get a sensible fallback category.
- Improve LLM category prompt: remove hardcoded product names (e.g. airline → Transport) that don't exist in every Odoo install; describe business types semantically so the model picks from the actual available list.
- Mention skipped statements in the final summary message.
- 77 tests, all passing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 00:25:44 -04:00
parent 6287b3bcef
commit 77fab52475
2 changed files with 98 additions and 7 deletions
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -32,6 +32,21 @@ _SKIP_LINE_RE = re.compile(
 # Any standalone dollar-like amount (optional $, up to 6 digits, 2 decimals)
 _ANY_DOLLAR_RE = re.compile(r'(?<!\d)\$?\s*([\d,]{1,6}\.\d{2})(?!\d)')

+# A single receipt typically has fewer than 10 lines with dollar amounts (items + tax + total).
+# Bank / credit-card statements have far more (one per transaction).
+_STMT_AMOUNT_LINE_THRESHOLD = 10
+
+
+def _is_likely_bank_statement(text: str) -> bool:
+    """Return True when the OCR text has too many amount-bearing lines to be a receipt.
+
+    Single receipts: typically 1-9 lines with dollar values.
+    Bank/card statements: 10-50+ lines (one per transaction).
+    """
+    count = sum(1 for line in text.splitlines() if _ANY_DOLLAR_RE.search(line))
+    return count >= _STMT_AMOUNT_LINE_THRESHOLD
+
+
 _DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b')   # YYYY-MM-DD or YYYY/MM/DD
 _DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b')  # M/D/YYYY
 _DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b')  # M/D/YY
@@ -237,7 +252,14 @@ class ExpensesAgent(BaseAgent):
            return []

        expense_products = await self._et.get_expense_products()
-        default_product_id = expense_products[0]['id'] if expense_products else None
+        # Prefer "Meals" as the fallback category — most receipts are food.
+        # Avoid blindly defaulting to whatever Odoo returns first (often "Communication").
+        _meals = next((p for p in expense_products
+                       if p['name'].lower() == 'meals'), None)
+        default_product_id = (
+            _meals['id'] if _meals
+            else (expense_products[0]['id'] if expense_products else None)
+        )
        product_map = {p['id']: p['name'] for p in expense_products}
        logger.info('expenses_agent: %d receipts received, %d expense products available',
                    len(receipts), len(expense_products))
@@ -281,6 +303,14 @@ class ExpensesAgent(BaseAgent):
                parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
                          'date': receipt.get('date_from_name') or _date.today().isoformat(),
                          'time': None, 'product_name': ''}
+            if parsed.get('skip'):
+                logger.info('expenses_agent: skipping bank/card statement: %s',
+                            receipt.get('filename'))
+                self._escalations_list.append(
+                    f"Skipped \"{receipt.get('filename')}\": "
+                    'looks like a bank or card statement, not a single receipt.'
+                )
+                continue
            logger.info('parsed filename=%r → vendor=%r amount=%s date=%r product=%r',
                        receipt.get('filename'), parsed.get('vendor'),
                        parsed.get('amount'), parsed.get('date'), parsed.get('product_name'))
@@ -447,6 +477,20 @@ class ExpensesAgent(BaseAgent):
        stripped = (text or '').strip()
        ocr_failed = not stripped or stripped.startswith('[')

+        # ── Bank / card statement detection ──────────────────────────────────
+        # A statement screenshot has many amount-bearing lines; running the
+        # max-scan on it returns a random large transaction, not a total.
+        # Skip these files so they don't produce a wildly wrong expense.
+        if not ocr_failed and _is_likely_bank_statement(stripped):
+            n = sum(1 for l in stripped.splitlines() if _ANY_DOLLAR_RE.search(l))
+            logger.warning(
+                'receipt %s: looks like a bank/card statement (%d amount lines) — skip',
+                filename, n,
+            )
+            return {'vendor': filename, 'amount': 0.0,
+                    'date': date_hint or today, 'time': None,
+                    'product_name': '', 'skip': True}
+
        # ── Amount: regex (deterministic) ────────────────────────────────────
        amount = _extract_amount_from_text(stripped) if not ocr_failed else 0.0

@@ -475,10 +519,13 @@ class ExpensesAgent(BaseAgent):
                'If this looks like a bank or credit-card statement listing '
                'multiple transactions rather than a single merchant receipt, '
                'use "". Use "" if no clear business name is visible.\n'
-                f'"product_name": the single best match from [{product_list}] '
-                'based on the type of business (restaurant→Meals, gas station→Fuel, '
-                'hotel→Hotel, airline/transit→Transport, office store→Office Supplies). '
-                'Use "" if none fit.\n\n'
+                f'"product_name": pick the single best match from [{product_list}]. '
+                'Guide: restaurant / cafe / fast food → food/meal product; '
+                'airline / airport / transit / taxi / parking / rental car → travel product; '
+                'gas station / petrol / fuel → fuel product; '
+                'hotel / motel / lodging → accommodation product; '
+                'office / tech / hardware store → supplies product. '
+                'Return "" if nothing fits.\n\n'
                f'Receipt text:\n{excerpt}\n\nJSON only:'
            )
        elif product_list:
@@ -520,8 +567,10 @@ class ExpensesAgent(BaseAgent):
                lines = '\n'.join(f'  • {a}' for a in self._actions_taken)
                n_skipped = data.get('n_skipped', 0)
                dup_note = f'\n({n_skipped} duplicate receipt(s) were automatically skipped.)' if n_skipped else ''
+                stmt_skips = [e for e in self._escalations_list if 'statement' in e.lower()]
+                stmt_note = ('\n⚠ ' + '\n⚠ '.join(stmt_skips)) if stmt_skips else ''
                summary = (
-                    f'Expense report created successfully:\n{lines}{dup_note}\n\n'
+                    f'Expense report created successfully:\n{lines}{dup_note}{stmt_note}\n\n'
                    'The report is in draft — open Odoo › Expenses, '
                    'review the amounts, and click Submit to send for approval.'
                )
--- a/tests/test_expenses_agent.py
+++ b/tests/test_expenses_agent.py
@@ -427,7 +427,7 @@ async def test_act_no_employee_returns_empty_and_escalates():
 # ---------------------------------------------------------------------------

 from agent_service.agents.expenses_agent import (
-    _extract_amount_from_text, _extract_date_from_text,
+    _extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
 )


@@ -502,6 +502,48 @@ class TestExtractAmount:
        assert _extract_amount_from_text(text) == 8.49


+class TestBankStatementDetection:
+    def _stmt(self, n: int) -> str:
+        """Generate fake bank statement with n transaction lines."""
+        lines = [f'05/{i+1:02d}  MERCHANT {i}  $1{i}.99' for i in range(n)]
+        return '\n'.join(lines)
+
+    def test_receipt_not_flagged(self):
+        # A typical restaurant receipt has < 10 amount-bearing lines
+        text = 'Acme Cafe\nBurger 12.99\nFries 4.50\nDrink 2.99\nTax 1.65\nTotal 22.13'
+        assert _is_likely_bank_statement(text) is False
+
+    def test_statement_flagged(self):
+        # 10 transaction lines → flagged as statement
+        assert _is_likely_bank_statement(self._stmt(10)) is True
+
+    def test_threshold_boundary(self):
+        assert _is_likely_bank_statement(self._stmt(9)) is False
+        assert _is_likely_bank_statement(self._stmt(10)) is True
+
+    def test_empty_text(self):
+        assert _is_likely_bank_statement('') is False
+
+    def test_no_amounts(self):
+        assert _is_likely_bank_statement('Hello world\nNo prices here') is False
+
+
+@pytest.mark.asyncio
+async def test_parse_bank_statement_returns_skip():
+    """Bank statement image must be skipped — no amount, skip=True returned."""
+    agent = _make_agent()
+    # Build fake OCR text with 12 transaction lines
+    stmt_text = '\n'.join(
+        f'05/{i+1:02d}  SOME MERCHANT {i}  ${10 + i}.99' for i in range(12)
+    )
+    result = await agent._parse_receipt_text(
+        stmt_text, '2026-05-15_bank.png',
+        expense_products=[{'id': 1, 'name': 'Meals'}],
+    )
+    assert result.get('skip') is True
+    assert result['amount'] == 0.0
+
+
 class TestExtractDate:
    def test_iso_format(self):
        assert _extract_date_from_text('Date: 2026-05-09') == '2026-05-09'