Fix receipt parsing quality and approval endpoint
Receipt quality: replace LLM-based amount/date extraction with regex. The LLM was hallucinating 2021/2022 dates and returning amounts as strings such as '198.40 USD'. Amounts now come from a deterministic regex (Total: / Grand Total: / Amount Due:). Dates are resolved in priority order: filename timestamp > OCR regex > today (no LLM date guessing). The LLM is now asked only for the vendor name and product category. Approval: fix the HTTP 500 from GET /approval/pending by selecting the correct column 'started_at' instead of 'created_at' (which does not exist). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,10 +3,65 @@ import asyncio
|
|||||||
import difflib
|
import difflib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from datetime import date as _date
|
from datetime import date as _date
|
||||||
from .base_agent import BaseAgent, AgentReport, AgentDirective, SweepReport
|
from .base_agent import BaseAgent, AgentReport, AgentDirective, SweepReport
|
||||||
from ..tools.expenses_tools import ExpensesTools
|
from ..tools.expenses_tools import ExpensesTools
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Receipt OCR helpers — regex-based, deterministic extraction
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Matches the final-total line on a receipt.
|
||||||
|
# Handles "Total: $22.46", "GRAND TOTAL 22.46", "Amount Due: 22.46", etc.
|
||||||
|
_TOTAL_RE = re.compile(
|
||||||
|
r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|'
|
||||||
|
r'total\s*amount|total\s*charged|you\s*paid|amount\s*paid|total)'
|
||||||
|
r'\s*[:\-]?\s*\$?\s*([\d,]+\.\d{2})',
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD
|
||||||
|
_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY
|
||||||
|
_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_amount_from_text(text: str) -> float:
|
||||||
|
"""Return the final total from OCR receipt text, or 0.0 if not found."""
|
||||||
|
if not text:
|
||||||
|
return 0.0
|
||||||
|
matches = list(_TOTAL_RE.finditer(text))
|
||||||
|
if matches:
|
||||||
|
raw = matches[-1].group(1).replace(',', '') # last match = grand total
|
||||||
|
try:
|
||||||
|
return float(raw)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_date_from_text(text: str) -> str | None:
|
||||||
|
"""Return the first plausible date in OCR text as YYYY-MM-DD, or None."""
|
||||||
|
if not text:
|
||||||
|
return None
|
||||||
|
m = _DATE_ISO_RE.search(text)
|
||||||
|
if m:
|
||||||
|
y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3))
|
||||||
|
if 2000 <= y <= 2099 and 1 <= mo <= 12 and 1 <= d <= 31:
|
||||||
|
return f'{y}-{mo:02d}-{d:02d}'
|
||||||
|
m = _DATE_US_RE.search(text)
|
||||||
|
if m:
|
||||||
|
mo, d, y = int(m.group(1)), int(m.group(2)), int(m.group(3))
|
||||||
|
if 1 <= mo <= 12 and 1 <= d <= 31 and y >= 2000:
|
||||||
|
return f'{y}-{mo:02d}-{d:02d}'
|
||||||
|
m = _DATE_US_SHORT_RE.search(text)
|
||||||
|
if m:
|
||||||
|
mo, d, yr = int(m.group(1)), int(m.group(2)), int(m.group(3))
|
||||||
|
if 1 <= mo <= 12 and 1 <= d <= 31:
|
||||||
|
y = 2000 + yr if yr < 50 else 1900 + yr
|
||||||
|
return f'{y}-{mo:02d}-{d:02d}'
|
||||||
|
return None
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
EXPENSES_TOOLS = [
|
EXPENSES_TOOLS = [
|
||||||
@@ -320,70 +375,61 @@ class ExpensesAgent(BaseAgent):
|
|||||||
async def _parse_receipt_text(self, text: str, filename: str,
|
async def _parse_receipt_text(self, text: str, filename: str,
|
||||||
expense_products: list = None,
|
expense_products: list = None,
|
||||||
date_hint: str = None) -> dict:
|
date_hint: str = None) -> dict:
|
||||||
today = _date.today().isoformat()
|
"""Parse a single receipt into structured fields.
|
||||||
fallback = {'vendor': filename, 'amount': 0.0,
|
|
||||||
'date': date_hint or today, 'time': None, 'product_name': ''}
|
|
||||||
|
|
||||||
|
Strategy (most-reliable first):
|
||||||
|
amount → regex on OCR text (deterministic)
|
||||||
|
date → filename timestamp > OCR regex > today
|
||||||
|
vendor → LLM (short excerpt, first ~600 chars)
|
||||||
|
product_name→ LLM (semantic match against expense product list)
|
||||||
|
|
||||||
|
The LLM is intentionally NOT asked for amount or date — the local
|
||||||
|
model hallucinates those fields when OCR text is ambiguous.
|
||||||
|
"""
|
||||||
|
today = _date.today().isoformat()
|
||||||
stripped = (text or '').strip()
|
stripped = (text or '').strip()
|
||||||
ocr_failed = not stripped or stripped.startswith('[')
|
ocr_failed = not stripped or stripped.startswith('[')
|
||||||
|
|
||||||
product_list = ''
|
# ── Amount: regex (deterministic) ────────────────────────────────────
|
||||||
if expense_products:
|
amount = _extract_amount_from_text(stripped) if not ocr_failed else 0.0
|
||||||
names = [p['name'] for p in expense_products]
|
|
||||||
product_list = ', '.join(f'"{n}"' for n in names)
|
|
||||||
|
|
||||||
if ocr_failed:
|
# ── Date: filename > OCR regex > today ───────────────────────────────
|
||||||
# No OCR text — still try to classify category from filename/date
|
if date_hint:
|
||||||
if not product_list:
|
date = date_hint
|
||||||
return fallback
|
elif not ocr_failed:
|
||||||
|
date = _extract_date_from_text(stripped) or today
|
||||||
|
else:
|
||||||
|
date = today
|
||||||
|
|
||||||
|
# ── Vendor + Category: LLM (two fields only) ─────────────────────────
|
||||||
|
vendor = filename
|
||||||
|
product_name = ''
|
||||||
|
product_list = ', '.join(f'"{p["name"]}"' for p in (expense_products or []))
|
||||||
|
|
||||||
|
if not ocr_failed:
|
||||||
|
# Give LLM only the header of the receipt — vendor is in the first lines
|
||||||
|
excerpt = stripped[:600]
|
||||||
prompt = (
|
prompt = (
|
||||||
f'A receipt photo named "{filename}" could not be read by OCR. '
|
'Return ONLY valid JSON with exactly two keys:\n'
|
||||||
f'Based only on the filename, pick the most likely expense category '
|
'"vendor": the store or restaurant name, copied exactly from the '
|
||||||
f'from this list: [{product_list}]. '
|
'first 1-3 lines of the receipt. Use "" if no clear name.\n'
|
||||||
f'Return ONLY valid JSON: {{"product_name": "..."}}'
|
f'"product_name": the single best match from [{product_list}] '
|
||||||
|
'based on the type of business (restaurant→Meals, gas station→Fuel, '
|
||||||
|
'hotel→Hotel, airline/transit→Transport, office store→Office Supplies). '
|
||||||
|
'Use "" if none fit.\n\n'
|
||||||
|
f'Receipt:\n{excerpt}\n\nJSON only:'
|
||||||
|
)
|
||||||
|
elif product_list:
|
||||||
|
# OCR failed — guess category from filename only
|
||||||
|
prompt = (
|
||||||
|
f'A receipt file named "{filename}" could not be read. '
|
||||||
|
f'Pick the most likely match from [{product_list}] based on the filename, '
|
||||||
|
f'or "". Return ONLY: {{"vendor": "", "product_name": "..."}}'
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Keep both the header (vendor/date) and footer (totals) of the receipt.
|
return {'vendor': filename, 'amount': amount, 'date': date,
|
||||||
# A plain [:N] cut discards the bottom of long receipts where the grand
|
'time': None, 'product_name': ''}
|
||||||
# total lives — the primary cause of amount=0 extraction errors.
|
|
||||||
if len(stripped) > 3000:
|
|
||||||
receipt_text = stripped[:1500] + '\n[...]\n' + stripped[-1500:]
|
|
||||||
else:
|
|
||||||
receipt_text = stripped
|
|
||||||
|
|
||||||
# When the filename carries a reliable timestamp, inject it directly
|
|
||||||
# so the LLM doesn't try to read (and potentially misread) the date
|
|
||||||
# from garbled OCR text.
|
|
||||||
if date_hint:
|
|
||||||
date_instruction = (
|
|
||||||
f'Use exactly "{date_hint}" — this date was read from the file '
|
|
||||||
f'timestamp and is more reliable than the OCR text.'
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
date_instruction = (
|
|
||||||
f'Extract from the receipt text in YYYY-MM-DD format; '
|
|
||||||
f'use {today} only if no date is visible.'
|
|
||||||
)
|
|
||||||
|
|
||||||
prompt = (
|
|
||||||
'You are a receipt data extractor. '
|
|
||||||
'Copy values EXACTLY as they appear in the text — '
|
|
||||||
'do NOT guess, infer, "correct" OCR errors, or invent plausible values.\n\n'
|
|
||||||
'Return ONLY valid JSON with these keys:\n'
|
|
||||||
f'"vendor": merchant name exactly as printed; '
|
|
||||||
f'empty string "" if you cannot find it clearly,\n'
|
|
||||||
f'"amount": the FINAL total — find a line labeled "Total", "Grand Total", '
|
|
||||||
f'"Amount Due", or "Balance Due"; copy the number exactly as written; '
|
|
||||||
f'never use subtotal, tax, or tip lines; '
|
|
||||||
f'return 0 if no clearly labeled final total is present,\n'
|
|
||||||
f'"date": {date_instruction}\n'
|
|
||||||
f'"time": transaction time HH:MM (24-hour) exactly as printed, or null,\n'
|
|
||||||
f'"product_name": best match from [{product_list}] or "".\n\n'
|
|
||||||
f'IMPORTANT: This text came from OCR and may contain garbled characters. '
|
|
||||||
f'If a value looks corrupted, return the safe default (0 / "" / null) '
|
|
||||||
f'rather than substituting a "more logical" value.\n\n'
|
|
||||||
f'Receipt text:\n{receipt_text}\n\nJSON only:'
|
|
||||||
)
|
|
||||||
try:
|
try:
|
||||||
resp = await self._llm.submit(
|
resp = await self._llm.submit(
|
||||||
[{'role': 'user', 'content': prompt}],
|
[{'role': 'user', 'content': prompt}],
|
||||||
@@ -393,16 +439,15 @@ class ExpensesAgent(BaseAgent):
|
|||||||
first, last = raw.find('{'), raw.rfind('}')
|
first, last = raw.find('{'), raw.rfind('}')
|
||||||
if first != -1 and last > first:
|
if first != -1 and last > first:
|
||||||
data = json.loads(raw[first:last + 1])
|
data = json.loads(raw[first:last + 1])
|
||||||
return {
|
v = str(data.get('vendor', '') or '').strip()
|
||||||
'vendor': str(data.get('vendor', filename)),
|
if v:
|
||||||
'amount': float(data.get('amount', 0.0)),
|
vendor = v
|
||||||
'date': str(data.get('date') or date_hint or today),
|
product_name = str(data.get('product_name', '') or '').strip()
|
||||||
'time': data.get('time') or None,
|
|
||||||
'product_name': str(data.get('product_name', '')),
|
|
||||||
}
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.warning('Receipt parse failed for %s: %s', filename, exc)
|
logger.warning('Receipt vendor/category parse failed for %s: %s', filename, exc)
|
||||||
return fallback
|
|
||||||
|
return {'vendor': vendor, 'amount': amount, 'date': date,
|
||||||
|
'time': None, 'product_name': product_name}
|
||||||
|
|
||||||
async def _report(self) -> AgentReport:
|
async def _report(self) -> AgentReport:
|
||||||
data = self._gathered_data
|
data = self._gathered_data
|
||||||
|
|||||||
@@ -33,8 +33,8 @@ async def list_pending():
|
|||||||
raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail='DB not ready')
|
raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail='DB not ready')
|
||||||
async with pool.acquire(timeout=10) as conn:
|
async with pool.acquire(timeout=10) as conn:
|
||||||
rows = await conn.fetch(
|
rows = await conn.fetch(
|
||||||
'SELECT directive_id, agent_name, action_type, description, created_at, context_data '
|
'SELECT directive_id, agent_name, action_type, description, started_at, context_data '
|
||||||
'FROM ab_directive_log WHERE status = $1 ORDER BY created_at ASC',
|
'FROM ab_directive_log WHERE status = $1 ORDER BY started_at ASC',
|
||||||
'pending_approval',
|
'pending_approval',
|
||||||
)
|
)
|
||||||
return [
|
return [
|
||||||
@@ -43,7 +43,7 @@ async def list_pending():
|
|||||||
agent=r['agent_name'] or '',
|
agent=r['agent_name'] or '',
|
||||||
action=r['action_type'] or '',
|
action=r['action_type'] or '',
|
||||||
description=r['description'] or '',
|
description=r['description'] or '',
|
||||||
created_at=str(r['created_at']),
|
created_at=str(r['started_at'] or ''),
|
||||||
context=r['context_data'] or {},
|
context=r['context_data'] or {},
|
||||||
)
|
)
|
||||||
for r in rows
|
for r in rows
|
||||||
|
|||||||
@@ -423,15 +423,73 @@ async def test_act_no_employee_returns_empty_and_escalates():
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# _parse_receipt_text — LLM extraction path
|
# _extract_amount_from_text / _extract_date_from_text — regex helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
from agent_service.agents.expenses_agent import (
|
||||||
|
_extract_amount_from_text, _extract_date_from_text,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractAmount:
    """Checks for the regex-based total extraction helper."""

    def test_simple_total(self):
        receipt = 'Acme\nTotal: $9.99'
        assert _extract_amount_from_text(receipt) == 9.99

    def test_grand_total(self):
        receipt = 'Subtotal: $20.00\nGrand Total: $22.46'
        assert _extract_amount_from_text(receipt) == 22.46

    def test_amount_due(self):
        receipt = 'Amount Due: 198.40'
        assert _extract_amount_from_text(receipt) == 198.40

    def test_no_dollar_sign(self):
        receipt = 'TOTAL 15.75'
        assert _extract_amount_from_text(receipt) == 15.75

    def test_last_match_wins(self):
        # The total printed last is the one that must be returned,
        # not the subtotal above it.
        receipt = 'Subtotal 18.00\nTax 1.50\nTotal 19.50'
        extracted = _extract_amount_from_text(receipt)
        assert extracted == 19.50

    def test_empty_text(self):
        extracted = _extract_amount_from_text('')
        assert extracted == 0.0

    def test_no_total_line(self):
        extracted = _extract_amount_from_text('No price here')
        assert extracted == 0.0

    def test_comma_in_amount(self):
        receipt = 'Grand Total: $1,234.56'
        assert _extract_amount_from_text(receipt) == 1234.56
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractDate:
    """Checks for the regex-based date extraction helper."""

    def test_iso_format(self):
        found = _extract_date_from_text('Date: 2026-05-09')
        assert found == '2026-05-09'

    def test_slash_iso(self):
        found = _extract_date_from_text('2026/05/09')
        assert found == '2026-05-09'

    def test_us_format(self):
        found = _extract_date_from_text('05/09/2026')
        assert found == '2026-05-09'

    def test_us_short_year(self):
        found = _extract_date_from_text('05/09/26')
        assert found == '2026-05-09'

    def test_no_date(self):
        found = _extract_date_from_text('No date here')
        assert found is None

    def test_empty(self):
        found = _extract_date_from_text('')
        assert found is None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _parse_receipt_text — combined extraction
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_parse_plain_ocr_text_uses_llm():
|
async def test_parse_plain_ocr_text_uses_llm_for_vendor():
|
||||||
"""Plain OCR text should go through the LLM extraction path."""
|
"""Regex extracts amount; LLM called only for vendor + product_name."""
|
||||||
agent = _make_agent()
|
agent = _make_agent()
|
||||||
llm_resp = MagicMock()
|
llm_resp = MagicMock()
|
||||||
llm_resp.content = '{"vendor":"Acme","amount":9.99,"date":"2026-05-09","time":null,"product_name":"Meals"}'
|
# LLM now only returns vendor + product_name
|
||||||
|
llm_resp.content = '{"vendor":"Acme","product_name":"Meals"}'
|
||||||
agent._llm.submit = AsyncMock(return_value=llm_resp)
|
agent._llm.submit = AsyncMock(return_value=llm_resp)
|
||||||
|
|
||||||
result = await agent._parse_receipt_text(
|
result = await agent._parse_receipt_text(
|
||||||
@@ -439,10 +497,44 @@ async def test_parse_plain_ocr_text_uses_llm():
|
|||||||
expense_products=[{'id': 1, 'name': 'Meals'}],
|
expense_products=[{'id': 1, 'name': 'Meals'}],
|
||||||
)
|
)
|
||||||
assert result['vendor'] == 'Acme'
|
assert result['vendor'] == 'Acme'
|
||||||
assert result['amount'] == 9.99
|
assert result['amount'] == 9.99 # from regex, not LLM
|
||||||
|
assert result['product_name'] == 'Meals'
|
||||||
agent._llm.submit.assert_called_once()
|
agent._llm.submit.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_parse_date_hint_overrides_ocr_date():
    """A date_hint passed by the caller wins over the date printed in the OCR text."""
    ocr_text = 'Shell Gas\n05/09/2021\nTotal: $45.00'

    agent = _make_agent()
    reply = MagicMock()
    reply.content = '{"vendor":"Shell","product_name":"Fuel"}'
    agent._llm.submit = AsyncMock(return_value=reply)

    parsed = await agent._parse_receipt_text(
        ocr_text,
        'shell.jpg',
        date_hint='2026-05-09',
    )

    # The OCR text carries a 2021 date, but the hint takes precedence.
    assert parsed['date'] == '2026-05-09'
    assert parsed['amount'] == 45.00
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_parse_ocr_failed_skips_llm_amount():
    """With a failed-OCR placeholder as text, amount is 0.0 and the date is the hint."""
    placeholder = '[Image: broken.jpg — OCR failed]'
    products = [{'id': 1, 'name': 'Meals'}]

    agent = _make_agent()
    reply = MagicMock()
    reply.content = '{"vendor":"","product_name":"Meals"}'
    agent._llm.submit = AsyncMock(return_value=reply)

    parsed = await agent._parse_receipt_text(
        placeholder,
        'broken.jpg',
        date_hint='2026-05-10',
        expense_products=products,
    )

    assert parsed['amount'] == 0.0
    assert parsed['date'] == '2026-05-10'
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# parse_upload — receipt_parser.py
|
# parse_upload — receipt_parser.py
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|||||||
Reference in New Issue
Block a user