fix(expenses): detect bank statements, fix default category, improve prompts

- Add _is_likely_bank_statement(): if OCR text has ≥10 lines with dollar
  amounts it is almost certainly a bank/card statement screenshot, not a
  single receipt.  Return skip=True so _act() skips it and adds a note to
  the escalations list instead of creating a $1,699 expense line.
- Fix default product selection in _act(): prefer "Meals" over whatever
  happens to be first in Odoo's expense product list ("Communication"),
  so unrecognised receipts get a sensible fallback category.
- Improve LLM category prompt: remove hardcoded product names (airline →
  Transport) that don't exist in every Odoo install; describe business
  types semantically so the model picks from the actual available list.
- Mention skipped statements in the final summary message.
- 77 tests, all passing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-21 00:25:44 -04:00
parent 6287b3bcef
commit 77fab52475
2 changed files with 98 additions and 7 deletions

View File

@@ -32,6 +32,21 @@ _SKIP_LINE_RE = re.compile(
# Any standalone dollar-like amount (optional $, up to 6 digits, 2 decimals)
_ANY_DOLLAR_RE = re.compile(r'(?<!\d)\$?\s*([\d,]{1,6}\.\d{2})(?!\d)')
# A single receipt has at most ~10 lines with dollar amounts (items + tax + total).
# Bank / credit-card statements have far more (one per transaction).
_STMT_AMOUNT_LINE_THRESHOLD = 10
def _is_likely_bank_statement(text: str) -> bool:
"""Return True when the OCR text has too many amount-bearing lines to be a receipt.
Single receipts: typically 1-9 lines with dollar values.
Bank/card statements: 10-50+ lines (one per transaction).
"""
count = sum(1 for line in text.splitlines() if _ANY_DOLLAR_RE.search(line))
return count >= _STMT_AMOUNT_LINE_THRESHOLD
_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD
_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY
_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY
@@ -237,7 +252,14 @@ class ExpensesAgent(BaseAgent):
return []
expense_products = await self._et.get_expense_products()
default_product_id = expense_products[0]['id'] if expense_products else None
# Prefer "Meals" as the fallback category — most receipts are food.
# Avoid blindly defaulting to whatever Odoo returns first (often "Communication").
_meals = next((p for p in expense_products
if p['name'].lower() == 'meals'), None)
default_product_id = (
_meals['id'] if _meals
else (expense_products[0]['id'] if expense_products else None)
)
product_map = {p['id']: p['name'] for p in expense_products}
logger.info('expenses_agent: %d receipts received, %d expense products available',
len(receipts), len(expense_products))
@@ -281,6 +303,14 @@ class ExpensesAgent(BaseAgent):
parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
'date': receipt.get('date_from_name') or _date.today().isoformat(),
'time': None, 'product_name': ''}
if parsed.get('skip'):
logger.info('expenses_agent: skipping bank/card statement: %s',
receipt.get('filename'))
self._escalations_list.append(
f"Skipped \"{receipt.get('filename')}\": "
'looks like a bank or card statement, not a single receipt.'
)
continue
logger.info('parsed filename=%r → vendor=%r amount=%s date=%r product=%r',
receipt.get('filename'), parsed.get('vendor'),
parsed.get('amount'), parsed.get('date'), parsed.get('product_name'))
@@ -447,6 +477,20 @@ class ExpensesAgent(BaseAgent):
stripped = (text or '').strip()
ocr_failed = not stripped or stripped.startswith('[')
# ── Bank / card statement detection ──────────────────────────────────
# A statement screenshot has many amount-bearing lines; running the
# max-scan on it returns a random large transaction, not a total.
# Skip these files so they don't produce a wildly wrong expense.
if not ocr_failed and _is_likely_bank_statement(stripped):
n = sum(1 for l in stripped.splitlines() if _ANY_DOLLAR_RE.search(l))
logger.warning(
'receipt %s: looks like a bank/card statement (%d amount lines) — skip',
filename, n,
)
return {'vendor': filename, 'amount': 0.0,
'date': date_hint or today, 'time': None,
'product_name': '', 'skip': True}
# ── Amount: regex (deterministic) ────────────────────────────────────
amount = _extract_amount_from_text(stripped) if not ocr_failed else 0.0
@@ -475,10 +519,13 @@ class ExpensesAgent(BaseAgent):
'If this looks like a bank or credit-card statement listing '
'multiple transactions rather than a single merchant receipt, '
'use "". Use "" if no clear business name is visible.\n'
f'"product_name": the single best match from [{product_list}] '
'based on the type of business (restaurant→Meals, gas station→Fuel, '
'hotel→Hotel, airline/transit→Transport, office store→Office Supplies). '
'Use "" if none fit.\n\n'
f'"product_name": pick the single best match from [{product_list}]. '
'Guide: restaurant / cafe / fast food → food/meal product; '
'airline / airport / transit / taxi / parking / rental car → travel product; '
'gas station / petrol / fuel → fuel product; '
'hotel / motel / lodging → accommodation product; '
'office / tech / hardware store → supplies product. '
'Return "" if nothing fits.\n\n'
f'Receipt text:\n{excerpt}\n\nJSON only:'
)
elif product_list:
@@ -520,8 +567,10 @@ class ExpensesAgent(BaseAgent):
lines = '\n'.join(f'{a}' for a in self._actions_taken)
n_skipped = data.get('n_skipped', 0)
dup_note = f'\n({n_skipped} duplicate receipt(s) were automatically skipped.)' if n_skipped else ''
stmt_skips = [e for e in self._escalations_list if 'statement' in e.lower()]
stmt_note = ('\n' + '\n'.join(stmt_skips)) if stmt_skips else ''
summary = (
f'Expense report created successfully:\n{lines}{dup_note}\n\n'
f'Expense report created successfully:\n{lines}{dup_note}{stmt_note}\n\n'
'The report is in draft — open Odoo Expenses, '
'review the amounts, and click Submit to send for approval.'
)

View File

@@ -427,7 +427,7 @@ async def test_act_no_employee_returns_empty_and_escalates():
# ---------------------------------------------------------------------------
from agent_service.agents.expenses_agent import (
_extract_amount_from_text, _extract_date_from_text,
_extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
)
@@ -502,6 +502,48 @@ class TestExtractAmount:
assert _extract_amount_from_text(text) == 8.49
class TestBankStatementDetection:
    """Checks for the ``_is_likely_bank_statement`` line-count heuristic."""

    def _stmt(self, n: int) -> str:
        """Build a synthetic statement made of ``n`` lines, one dollar amount per line."""
        rows = []
        for i in range(n):
            rows.append(f'05/{i+1:02d} MERCHANT {i} $1{i}.99')
        return '\n'.join(rows)

    def test_receipt_not_flagged(self):
        # Five priced lines (three items, tax, total): below the statement cut-off.
        receipt_text = 'Acme Cafe\nBurger 12.99\nFries 4.50\nDrink 2.99\nTax 1.65\nTotal 22.13'
        flagged = _is_likely_bank_statement(receipt_text)
        assert flagged is False

    def test_statement_flagged(self):
        # Ten amount-bearing lines reach the cut-off and are flagged.
        ten_rows = self._stmt(10)
        assert _is_likely_bank_statement(ten_rows) is True

    def test_threshold_boundary(self):
        # One line short of the cut-off is not flagged; exactly at the cut-off is.
        just_below = _is_likely_bank_statement(self._stmt(9))
        exactly_at = _is_likely_bank_statement(self._stmt(10))
        assert just_below is False
        assert exactly_at is True

    def test_empty_text(self):
        # No lines at all means no amount lines.
        flagged = _is_likely_bank_statement('')
        assert flagged is False

    def test_no_amounts(self):
        # Text without any dollar-like value is never a statement.
        flagged = _is_likely_bank_statement('Hello world\nNo prices here')
        assert flagged is False
@pytest.mark.asyncio
async def test_parse_bank_statement_returns_skip():
    """Statement-like OCR text must come back flagged skip=True with a zero amount."""
    # Twelve transaction rows, each carrying one dollar amount.
    rows = [f'05/{i+1:02d} SOME MERCHANT {i} ${10 + i}.99' for i in range(12)]
    stmt_text = '\n'.join(rows)
    products = [{'id': 1, 'name': 'Meals'}]

    agent = _make_agent()
    result = await agent._parse_receipt_text(
        stmt_text,
        '2026-05-15_bank.png',
        expense_products=products,
    )

    assert result.get('skip') is True
    assert result['amount'] == 0.0
class TestExtractDate:
def test_iso_format(self):
assert _extract_date_from_text('Date: 2026-05-09') == '2026-05-09'