fix(expenses): detect bank statements, fix default category, improve prompts
- Add _is_likely_bank_statement(): if OCR text has ≥10 lines with dollar
amounts it is almost certainly a bank/card statement screenshot, not a
single receipt. Return skip=True so _act() skips it and adds a note to
the escalations list instead of creating a $1,699 expense line.
- Fix default product selection in _act(): prefer "Meals" over whatever
happens to be first in Odoo's expense product list ("Communication"),
so unrecognised receipts get a sensible fallback category.
- Improve LLM category prompt: remove hardcoded product names (airline →
Transport) that don't exist in every Odoo install; describe business
types semantically so the model picks from the actual available list.
- Mention skipped statements in the final summary message.
- 77 tests, all passing.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -32,6 +32,21 @@ _SKIP_LINE_RE = re.compile(
|
|||||||
# Any standalone dollar-like amount (optional $, up to 6 digits, 2 decimals)
|
# Any standalone dollar-like amount (optional $, up to 6 digits, 2 decimals)
|
||||||
_ANY_DOLLAR_RE = re.compile(r'(?<!\d)\$?\s*([\d,]{1,6}\.\d{2})(?!\d)')
|
_ANY_DOLLAR_RE = re.compile(r'(?<!\d)\$?\s*([\d,]{1,6}\.\d{2})(?!\d)')
|
||||||
|
|
||||||
|
# A single receipt has at most ~10 lines with dollar amounts (items + tax + total).
|
||||||
|
# Bank / credit-card statements have far more (one per transaction).
|
||||||
|
_STMT_AMOUNT_LINE_THRESHOLD = 10
|
||||||
|
|
||||||
|
|
||||||
|
def _is_likely_bank_statement(text: str) -> bool:
|
||||||
|
"""Return True when the OCR text has too many amount-bearing lines to be a receipt.
|
||||||
|
|
||||||
|
Single receipts: typically 1-9 lines with dollar values.
|
||||||
|
Bank/card statements: 10-50+ lines (one per transaction).
|
||||||
|
"""
|
||||||
|
count = sum(1 for line in text.splitlines() if _ANY_DOLLAR_RE.search(line))
|
||||||
|
return count >= _STMT_AMOUNT_LINE_THRESHOLD
|
||||||
|
|
||||||
|
|
||||||
_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD
|
_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD
|
||||||
_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY
|
_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY
|
||||||
_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY
|
_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY
|
||||||
@@ -237,7 +252,14 @@ class ExpensesAgent(BaseAgent):
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
expense_products = await self._et.get_expense_products()
|
expense_products = await self._et.get_expense_products()
|
||||||
default_product_id = expense_products[0]['id'] if expense_products else None
|
# Prefer "Meals" as the fallback category — most receipts are food.
|
||||||
|
# Avoid blindly defaulting to whatever Odoo returns first (often "Communication").
|
||||||
|
_meals = next((p for p in expense_products
|
||||||
|
if p['name'].lower() == 'meals'), None)
|
||||||
|
default_product_id = (
|
||||||
|
_meals['id'] if _meals
|
||||||
|
else (expense_products[0]['id'] if expense_products else None)
|
||||||
|
)
|
||||||
product_map = {p['id']: p['name'] for p in expense_products}
|
product_map = {p['id']: p['name'] for p in expense_products}
|
||||||
logger.info('expenses_agent: %d receipts received, %d expense products available',
|
logger.info('expenses_agent: %d receipts received, %d expense products available',
|
||||||
len(receipts), len(expense_products))
|
len(receipts), len(expense_products))
|
||||||
@@ -281,6 +303,14 @@ class ExpensesAgent(BaseAgent):
|
|||||||
parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
|
parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
|
||||||
'date': receipt.get('date_from_name') or _date.today().isoformat(),
|
'date': receipt.get('date_from_name') or _date.today().isoformat(),
|
||||||
'time': None, 'product_name': ''}
|
'time': None, 'product_name': ''}
|
||||||
|
if parsed.get('skip'):
|
||||||
|
logger.info('expenses_agent: skipping bank/card statement: %s',
|
||||||
|
receipt.get('filename'))
|
||||||
|
self._escalations_list.append(
|
||||||
|
f"Skipped \"{receipt.get('filename')}\": "
|
||||||
|
'looks like a bank or card statement, not a single receipt.'
|
||||||
|
)
|
||||||
|
continue
|
||||||
logger.info('parsed filename=%r → vendor=%r amount=%s date=%r product=%r',
|
logger.info('parsed filename=%r → vendor=%r amount=%s date=%r product=%r',
|
||||||
receipt.get('filename'), parsed.get('vendor'),
|
receipt.get('filename'), parsed.get('vendor'),
|
||||||
parsed.get('amount'), parsed.get('date'), parsed.get('product_name'))
|
parsed.get('amount'), parsed.get('date'), parsed.get('product_name'))
|
||||||
@@ -447,6 +477,20 @@ class ExpensesAgent(BaseAgent):
|
|||||||
stripped = (text or '').strip()
|
stripped = (text or '').strip()
|
||||||
ocr_failed = not stripped or stripped.startswith('[')
|
ocr_failed = not stripped or stripped.startswith('[')
|
||||||
|
|
||||||
|
# ── Bank / card statement detection ──────────────────────────────────
|
||||||
|
# A statement screenshot has many amount-bearing lines; running the
|
||||||
|
# max-scan on it returns a random large transaction, not a total.
|
||||||
|
# Skip these files so they don't produce a wildly wrong expense.
|
||||||
|
if not ocr_failed and _is_likely_bank_statement(stripped):
|
||||||
|
n = sum(1 for l in stripped.splitlines() if _ANY_DOLLAR_RE.search(l))
|
||||||
|
logger.warning(
|
||||||
|
'receipt %s: looks like a bank/card statement (%d amount lines) — skip',
|
||||||
|
filename, n,
|
||||||
|
)
|
||||||
|
return {'vendor': filename, 'amount': 0.0,
|
||||||
|
'date': date_hint or today, 'time': None,
|
||||||
|
'product_name': '', 'skip': True}
|
||||||
|
|
||||||
# ── Amount: regex (deterministic) ────────────────────────────────────
|
# ── Amount: regex (deterministic) ────────────────────────────────────
|
||||||
amount = _extract_amount_from_text(stripped) if not ocr_failed else 0.0
|
amount = _extract_amount_from_text(stripped) if not ocr_failed else 0.0
|
||||||
|
|
||||||
@@ -475,10 +519,13 @@ class ExpensesAgent(BaseAgent):
|
|||||||
'If this looks like a bank or credit-card statement listing '
|
'If this looks like a bank or credit-card statement listing '
|
||||||
'multiple transactions rather than a single merchant receipt, '
|
'multiple transactions rather than a single merchant receipt, '
|
||||||
'use "". Use "" if no clear business name is visible.\n'
|
'use "". Use "" if no clear business name is visible.\n'
|
||||||
f'"product_name": the single best match from [{product_list}] '
|
f'"product_name": pick the single best match from [{product_list}]. '
|
||||||
'based on the type of business (restaurant→Meals, gas station→Fuel, '
|
'Guide: restaurant / cafe / fast food → food/meal product; '
|
||||||
'hotel→Hotel, airline/transit→Transport, office store→Office Supplies). '
|
'airline / airport / transit / taxi / parking / rental car → travel product; '
|
||||||
'Use "" if none fit.\n\n'
|
'gas station / petrol / fuel → fuel product; '
|
||||||
|
'hotel / motel / lodging → accommodation product; '
|
||||||
|
'office / tech / hardware store → supplies product. '
|
||||||
|
'Return "" if nothing fits.\n\n'
|
||||||
f'Receipt text:\n{excerpt}\n\nJSON only:'
|
f'Receipt text:\n{excerpt}\n\nJSON only:'
|
||||||
)
|
)
|
||||||
elif product_list:
|
elif product_list:
|
||||||
@@ -520,8 +567,10 @@ class ExpensesAgent(BaseAgent):
|
|||||||
lines = '\n'.join(f' • {a}' for a in self._actions_taken)
|
lines = '\n'.join(f' • {a}' for a in self._actions_taken)
|
||||||
n_skipped = data.get('n_skipped', 0)
|
n_skipped = data.get('n_skipped', 0)
|
||||||
dup_note = f'\n({n_skipped} duplicate receipt(s) were automatically skipped.)' if n_skipped else ''
|
dup_note = f'\n({n_skipped} duplicate receipt(s) were automatically skipped.)' if n_skipped else ''
|
||||||
|
stmt_skips = [e for e in self._escalations_list if 'statement' in e.lower()]
|
||||||
|
stmt_note = ('\n⚠ ' + '\n⚠ '.join(stmt_skips)) if stmt_skips else ''
|
||||||
summary = (
|
summary = (
|
||||||
f'Expense report created successfully:\n{lines}{dup_note}\n\n'
|
f'Expense report created successfully:\n{lines}{dup_note}{stmt_note}\n\n'
|
||||||
'The report is in draft — open Odoo › Expenses, '
|
'The report is in draft — open Odoo › Expenses, '
|
||||||
'review the amounts, and click Submit to send for approval.'
|
'review the amounts, and click Submit to send for approval.'
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -427,7 +427,7 @@ async def test_act_no_employee_returns_empty_and_escalates():
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
from agent_service.agents.expenses_agent import (
|
from agent_service.agents.expenses_agent import (
|
||||||
_extract_amount_from_text, _extract_date_from_text,
|
_extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -502,6 +502,48 @@ class TestExtractAmount:
|
|||||||
assert _extract_amount_from_text(text) == 8.49
|
assert _extract_amount_from_text(text) == 8.49
|
||||||
|
|
||||||
|
|
||||||
|
class TestBankStatementDetection:
    """Unit tests for the `_is_likely_bank_statement` line-count heuristic."""

    def _stmt(self, n: int) -> str:
        """Generate fake bank statement with n transaction lines."""
        # Each line carries exactly one dollar amount, so n lines == n hits.
        lines = [f'05/{i+1:02d} MERCHANT {i} $1{i}.99' for i in range(n)]
        return '\n'.join(lines)

    def test_receipt_not_flagged(self):
        # A typical restaurant receipt has < 10 amount-bearing lines
        text = 'Acme Cafe\nBurger 12.99\nFries 4.50\nDrink 2.99\nTax 1.65\nTotal 22.13'
        assert _is_likely_bank_statement(text) is False

    def test_statement_flagged(self):
        # 10 transaction lines → flagged as statement
        assert _is_likely_bank_statement(self._stmt(10)) is True

    def test_threshold_boundary(self):
        # 9 amount lines stay a receipt; the 10th tips it into a statement.
        assert _is_likely_bank_statement(self._stmt(9)) is False
        assert _is_likely_bank_statement(self._stmt(10)) is True

    def test_empty_text(self):
        assert _is_likely_bank_statement('') is False

    def test_no_amounts(self):
        assert _is_likely_bank_statement('Hello world\nNo prices here') is False
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_parse_bank_statement_returns_skip():
    """Bank statement image must be skipped — no amount, skip=True returned."""
    agent = _make_agent()
    # Build fake OCR text with 12 transaction lines (one dollar amount each),
    # which is above the statement-detection cutoff of 10 amount lines.
    stmt_text = '\n'.join(
        f'05/{i+1:02d} SOME MERCHANT {i} ${10 + i}.99' for i in range(12)
    )
    result = await agent._parse_receipt_text(
        stmt_text, '2026-05-15_bank.png',
        expense_products=[{'id': 1, 'name': 'Meals'}],
    )
    # The parser must flag the file for skipping and must not report any
    # amount picked out of the transaction list.
    assert result.get('skip') is True
    assert result['amount'] == 0.0
|
||||||
|
|
||||||
|
|
||||||
class TestExtractDate:
|
class TestExtractDate:
|
||||||
def test_iso_format(self):
|
def test_iso_format(self):
|
||||||
assert _extract_date_from_text('Date: 2026-05-09') == '2026-05-09'
|
assert _extract_date_from_text('Date: 2026-05-09') == '2026-05-09'
|
||||||
|
|||||||
Reference in New Issue
Block a user