Add vision LLM path for receipt vendor/category identification
When RECEIPT_VISION_MODE=vision (the default), uploaded receipt images are sent directly to the vision-capable LLM (llama3.2-vision via Ollama) instead of the OCR text excerpt. The model can read logos, stylised fonts, and layouts that Tesseract OCR mangles (Home Depot, HMSHost/Sergio's, etc.).

Architecture:
- amount + date: always from Tesseract regex (deterministic, never LLM)
- vendor + category: vision LLM when an image is available, text LLM as fallback
- Fallthrough: if the vision call fails for any reason, the text path is tried next
- PDF/TXT/HTML receipts: always use the text path (not visual media)

Revert instantly without a rebuild:
    echo "RECEIPT_VISION_MODE=text" >> /root/odoo/odoo-ai/.env
    docker compose up -d agent-service

Changes:
- config.py: add receipt_vision_mode setting (default 'vision')
- expenses_agent.py: _VISION_MIMETYPES, _get_vision_mode() helper, dual-path _parse_receipt_text (b64/mimetype params), _act() passes b64
- tests: 92 passing — 4 new vision tests, 2 existing prompt tests pinned to text mode via a _get_vision_mode patch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -428,7 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates():
|
||||
|
||||
from agent_service.agents.expenses_agent import (
|
||||
_extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
|
||||
_MONTH_MAP,
|
||||
_MONTH_MAP, _get_vision_mode,
|
||||
)
|
||||
|
||||
|
||||
@@ -671,10 +671,10 @@ async def test_parse_ocr_failed_skips_llm_amount():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_vendor_prompt_does_not_contain_mcdonalds():
|
||||
"""The vendor LLM prompt must not reference 'McDonald' as a correction
|
||||
example — it biases the model toward returning McDonald's whenever OCR
|
||||
text is unclear, causing unrelated receipts (Home Depot, HMSHost) to be
|
||||
misidentified as McDonald's.
|
||||
"""The text-path vendor prompt must not reference 'McDonald' — it biases
|
||||
the model toward returning McDonald's whenever OCR text is unclear.
|
||||
Pinned to text mode so vision path (which has its own cleaner prompt) does
|
||||
not interfere.
|
||||
"""
|
||||
agent = _make_agent()
|
||||
captured: list[str] = []
|
||||
@@ -689,24 +689,22 @@ async def test_vendor_prompt_does_not_contain_mcdonalds():
|
||||
|
||||
agent._llm.submit = _capture
|
||||
|
||||
await agent._parse_receipt_text(
|
||||
'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
|
||||
'homedepot.jpg',
|
||||
expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
|
||||
)
|
||||
with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
|
||||
await agent._parse_receipt_text(
|
||||
'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
|
||||
'homedepot.jpg',
|
||||
expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
|
||||
)
|
||||
|
||||
full_prompt = ' '.join(captured)
|
||||
assert 'McDonald' not in full_prompt, (
|
||||
"Vendor prompt must not contain 'McDonald' — it biases the model toward "
|
||||
"returning McDonald's for any ambiguous receipt."
|
||||
"Text-path prompt must not contain 'McDonald' — it biases the model."
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
|
||||
"""Prompt must explicitly tell the LLM not to substitute a brand name that
|
||||
isn't in the OCR text — prevents "default to well-known fast food" behaviour.
|
||||
"""
|
||||
"""Text-path prompt must tell LLM not to substitute a brand not in the OCR text."""
|
||||
agent = _make_agent()
|
||||
captured: list[str] = []
|
||||
|
||||
@@ -720,14 +718,14 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
|
||||
|
||||
agent._llm.submit = _capture
|
||||
|
||||
await agent._parse_receipt_text(
|
||||
'(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
|
||||
'sergios.jpg',
|
||||
expense_products=[{'id': 1, 'name': 'Meals'}],
|
||||
)
|
||||
with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
|
||||
await agent._parse_receipt_text(
|
||||
'(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
|
||||
'sergios.jpg',
|
||||
expense_products=[{'id': 1, 'name': 'Meals'}],
|
||||
)
|
||||
|
||||
full_prompt = ' '.join(captured)
|
||||
# The prompt should warn the model not to invent brand names
|
||||
assert 'only use a brand name' in full_prompt.lower() or \
|
||||
'do not' in full_prompt.lower() or \
|
||||
'not substitute' in full_prompt.lower(), (
|
||||
@@ -735,6 +733,129 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Vision LLM path — _parse_receipt_text with b64/mimetype
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.asyncio
async def test_vision_path_sends_image_to_llm():
    """Vision mode with a JPEG payload: the LLM message must carry the b64 image.

    The vision-mode lookup is patched to return 'vision' and the LLM submit
    hook is replaced by a recorder, so the test can inspect exactly what
    would have been sent to the model.
    """
    agent = _make_agent()
    seen: list = []

    # Canned model reply; vendor and product_name are the only keys supplied.
    canned_reply = MagicMock(
        content='{"vendor":"Home Depot","product_name":"Supplies"}',
    )

    async def _record_submit(messages, caller=None):
        # Keep every message handed to the LLM for the assertions below.
        seen.extend(messages)
        return canned_reply

    agent._llm.submit = _record_submit

    vision_mode_target = 'agent_service.agents.expenses_agent._get_vision_mode'
    with patch(vision_mode_target, return_value='vision'):
        result = await agent._parse_receipt_text(
            'THE HOME DEPOT\nTotal: $36.78',
            'homedepot.jpg',
            expense_products=[{'id': 1, 'name': 'Supplies'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )

    assert result['vendor'] == 'Home Depot'
    assert result['amount'] == 36.78
    assert len(seen) == 1
    msg = seen[0]
    assert 'images' in msg, "Vision path must include 'images' in LLM message"
    assert msg['images'] == ['FAKEBASE64DATA']
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_text_mode_skips_vision_even_with_image():
    """Text mode overrides an attached image: no 'images' key reaches the LLM.

    Image data and an image mimetype are both supplied, but the vision-mode
    lookup is patched to return 'text', so the single recorded message must
    not contain an 'images' entry.
    """
    agent = _make_agent()
    seen: list = []

    canned_reply = MagicMock(
        content='{"vendor":"Home Depot","product_name":"Supplies"}',
    )

    async def _record_submit(messages, caller=None):
        # Record the outgoing messages instead of calling a real model.
        seen.extend(messages)
        return canned_reply

    agent._llm.submit = _record_submit

    vision_mode_target = 'agent_service.agents.expenses_agent._get_vision_mode'
    with patch(vision_mode_target, return_value='text'):
        await agent._parse_receipt_text(
            'THE HOME DEPOT\nTotal: $36.78',
            'homedepot.jpg',
            expense_products=[{'id': 1, 'name': 'Supplies'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )

    assert len(seen) == 1
    first_message = seen[0]
    assert 'images' not in first_message, (
        "Text mode must NOT send images to the LLM."
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_vision_falls_back_to_text_on_llm_error():
    """A failing first LLM call must be followed by a second, successful one.

    The stubbed submit hook raises on its first invocation and returns a
    canned reply afterwards; the parsed result must come from that second
    call, and exactly two calls must have been made.
    """
    agent = _make_agent()
    calls_made = 0

    canned_reply = MagicMock(content='{"vendor":"Shell","product_name":"Fuel"}')

    async def _fail_once_then_reply(messages, caller=None):
        nonlocal calls_made
        calls_made += 1
        # Only the very first submission blows up; later ones succeed.
        if calls_made == 1:
            raise RuntimeError('simulated vision model error')
        return canned_reply

    agent._llm.submit = _fail_once_then_reply

    vision_mode_target = 'agent_service.agents.expenses_agent._get_vision_mode'
    with patch(vision_mode_target, return_value='vision'):
        result = await agent._parse_receipt_text(
            'SHELL GAS STATION\nTotal Sale $55.00',
            'shell.jpg',
            expense_products=[{'id': 1, 'name': 'Fuel'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )

    assert calls_made == 2, "Must make exactly 2 LLM calls (vision failed, text succeeded)"
    assert result['vendor'] == 'Shell'
    assert result['amount'] == 55.00
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_non_image_mimetype_uses_text_path_in_vision_mode():
    """A PDF upload must not be sent as an image, even with vision mode on.

    Base64 data is supplied together with an 'application/pdf' mimetype while
    the vision-mode lookup is patched to 'vision'; the one recorded LLM
    message must have no 'images' key.
    """
    agent = _make_agent()
    seen: list = []

    canned_reply = MagicMock(
        content='{"vendor":"United Airlines","product_name":"Travel"}',
    )

    async def _record_submit(messages, caller=None):
        # Capture the outgoing messages for inspection after the call.
        seen.extend(messages)
        return canned_reply

    agent._llm.submit = _record_submit

    vision_mode_target = 'agent_service.agents.expenses_agent._get_vision_mode'
    with patch(vision_mode_target, return_value='vision'):
        await agent._parse_receipt_text(
            'United Airlines\nBaggage Fee\nTotal: $45.00',
            'ticket.pdf',
            expense_products=[{'id': 1, 'name': 'Travel'}],
            b64='FAKEBASE64DATA',
            mimetype='application/pdf',  # not an image type, so no vision call
        )

    assert len(seen) == 1
    first_message = seen[0]
    assert 'images' not in first_message, (
        "PDF receipts must not be sent as images even in vision mode."
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# parse_upload — receipt_parser.py
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user