Fix receipt parsing quality and approval endpoint
Receipt quality: replace LLM amount/date extraction with regex. LLM was hallucinating 2021/2022 dates and returning '198.40 USD' strings. Amounts now use deterministic regex (Total:/Grand Total:/Amount Due:). Dates: filename timestamp > OCR regex > today (no LLM date guessing). LLM only asked for vendor name + product category. Approval: fix GET /approval/pending 500 by using correct column name 'started_at' instead of 'created_at' (which does not exist). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -423,15 +423,73 @@ async def test_act_no_employee_returns_empty_and_escalates():
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _parse_receipt_text — LLM extraction path
|
||||
# _extract_amount_from_text / _extract_date_from_text — regex helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
from agent_service.agents.expenses_agent import (
|
||||
_extract_amount_from_text, _extract_date_from_text,
|
||||
)
|
||||
|
||||
|
||||
class TestExtractAmount:
    """Checks for the regex helper ``_extract_amount_from_text``."""

    def test_simple_total(self):
        """A 'Total:' line with a dollar sign yields the numeric amount."""
        parsed = _extract_amount_from_text('Acme\nTotal: $9.99')
        assert parsed == 9.99

    def test_grand_total(self):
        """The 'Grand Total:' figure is returned, not the subtotal."""
        receipt = 'Subtotal: $20.00\nGrand Total: $22.46'
        parsed = _extract_amount_from_text(receipt)
        assert parsed == 22.46

    def test_amount_due(self):
        """An 'Amount Due:' line without a currency symbol is parsed."""
        parsed = _extract_amount_from_text('Amount Due: 198.40')
        assert parsed == 198.40

    def test_no_dollar_sign(self):
        """An upper-case 'TOTAL' with no colon or dollar sign is parsed."""
        parsed = _extract_amount_from_text('TOTAL 15.75')
        assert parsed == 15.75

    def test_last_match_wins(self):
        """The final 'Total' line is expected, not the earlier figures."""
        # The subtotal and tax lines come first; only the closing total
        # should be reported.
        receipt = 'Subtotal 18.00\nTax 1.50\nTotal 19.50'
        parsed = _extract_amount_from_text(receipt)
        assert parsed == 19.50

    def test_empty_text(self):
        """Empty input yields 0.0."""
        parsed = _extract_amount_from_text('')
        assert parsed == 0.0

    def test_no_total_line(self):
        """Text without any amount yields 0.0."""
        parsed = _extract_amount_from_text('No price here')
        assert parsed == 0.0

    def test_comma_in_amount(self):
        """A thousands separator is accepted in the amount."""
        parsed = _extract_amount_from_text('Grand Total: $1,234.56')
        assert parsed == 1234.56
|
||||
|
||||
|
||||
class TestExtractDate:
    """Checks for the regex helper ``_extract_date_from_text``."""

    def test_iso_format(self):
        """A dash-separated ISO date is returned unchanged."""
        found = _extract_date_from_text('Date: 2026-05-09')
        assert found == '2026-05-09'

    def test_slash_iso(self):
        """A year-first date with slashes is returned with dashes."""
        found = _extract_date_from_text('2026/05/09')
        assert found == '2026-05-09'

    def test_us_format(self):
        """A month/day/four-digit-year date is returned in ISO form."""
        found = _extract_date_from_text('05/09/2026')
        assert found == '2026-05-09'

    def test_us_short_year(self):
        """A month/day/two-digit-year date is returned with a 20xx year."""
        found = _extract_date_from_text('05/09/26')
        assert found == '2026-05-09'

    def test_no_date(self):
        """Text without a date yields None."""
        found = _extract_date_from_text('No date here')
        assert found is None

    def test_empty(self):
        """Empty input yields None."""
        found = _extract_date_from_text('')
        assert found is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _parse_receipt_text — combined extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_plain_ocr_text_uses_llm():
|
||||
"""Plain OCR text should go through the LLM extraction path."""
|
||||
async def test_parse_plain_ocr_text_uses_llm_for_vendor():
|
||||
"""Regex extracts amount; LLM called only for vendor + product_name."""
|
||||
agent = _make_agent()
|
||||
llm_resp = MagicMock()
|
||||
llm_resp.content = '{"vendor":"Acme","amount":9.99,"date":"2026-05-09","time":null,"product_name":"Meals"}'
|
||||
# LLM now only returns vendor + product_name
|
||||
llm_resp.content = '{"vendor":"Acme","product_name":"Meals"}'
|
||||
agent._llm.submit = AsyncMock(return_value=llm_resp)
|
||||
|
||||
result = await agent._parse_receipt_text(
|
||||
@@ -439,10 +497,44 @@ async def test_parse_plain_ocr_text_uses_llm():
|
||||
expense_products=[{'id': 1, 'name': 'Meals'}],
|
||||
)
|
||||
assert result['vendor'] == 'Acme'
|
||||
assert result['amount'] == 9.99
|
||||
assert result['amount'] == 9.99 # from regex, not LLM
|
||||
assert result['product_name'] == 'Meals'
|
||||
agent._llm.submit.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_date_hint_overrides_ocr_date():
    """The supplied ``date_hint`` must win over the date in the OCR text."""
    agent = _make_agent()
    # The stubbed LLM reply carries only vendor and product_name.
    fake_reply = MagicMock(content='{"vendor":"Shell","product_name":"Fuel"}')
    agent._llm.submit = AsyncMock(return_value=fake_reply)

    ocr_text = 'Shell Gas\n05/09/2021\nTotal: $45.00'
    result = await agent._parse_receipt_text(
        ocr_text,
        'shell.jpg',
        date_hint='2026-05-09',
    )

    # The OCR text says 2021; the hint is expected in the result instead.
    assert result['date'] == '2026-05-09'
    assert result['amount'] == 45.00
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_ocr_failed_skips_llm_amount():
    """An OCR-failure placeholder gives amount 0.0 and the hinted date."""
    agent = _make_agent()
    fake_reply = MagicMock(content='{"vendor":"","product_name":"Meals"}')
    agent._llm.submit = AsyncMock(return_value=fake_reply)

    placeholder = '[Image: broken.jpg — OCR failed]'
    products = [{'id': 1, 'name': 'Meals'}]
    result = await agent._parse_receipt_text(
        placeholder,
        'broken.jpg',
        date_hint='2026-05-10',
        expense_products=products,
    )

    # The placeholder contains no total line, so a zero amount is expected.
    assert result['amount'] == 0.0
    assert result['date'] == '2026-05-10'
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# parse_upload — receipt_parser.py
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user