Add vision LLM path for receipt vendor/category identification

When RECEIPT_VISION_MODE=vision (default), uploaded receipt images are sent
directly to the vision-capable LLM (llama3.2-vision via Ollama) instead of
the OCR text excerpt.  The model can read logos, stylised fonts, and layouts
that Tesseract OCR mangles (Home Depot, HMSHost/Sergio's, etc.).

Architecture:
- amount + date: always extracted by regex from the Tesseract OCR text (deterministic, never LLM)
- vendor + category: vision LLM when an image is available, text LLM as fallback
- Fallback: if the vision call fails for any reason, the text path is tried next
- PDF/TXT/HTML receipts: always use text path (not visual media)

Revert instantly without a rebuild:
  echo "RECEIPT_VISION_MODE=text" >> /root/odoo/odoo-ai/.env
  docker compose up -d agent-service

config.py: add receipt_vision_mode setting (default 'vision')
expenses_agent.py: _VISION_MIMETYPES, _get_vision_mode() helper,
  dual-path _parse_receipt_text (b64/mimetype params), _act() passes b64
tests: 92 passing — 4 new vision tests, 2 existing prompt tests
  pinned to text mode via _get_vision_mode patch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-21 01:06:55 -04:00
parent db06fede5f
commit a736f3352b
3 changed files with 258 additions and 45 deletions

View File

@@ -428,7 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates():
from agent_service.agents.expenses_agent import (
_extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
_MONTH_MAP,
_MONTH_MAP, _get_vision_mode,
)
@@ -671,10 +671,10 @@ async def test_parse_ocr_failed_skips_llm_amount():
@pytest.mark.asyncio
async def test_vendor_prompt_does_not_contain_mcdonalds():
"""The vendor LLM prompt must not reference 'McDonald' as a correction
example — it biases the model toward returning McDonald's whenever OCR
text is unclear, causing unrelated receipts (Home Depot, HMSHost) to be
misidentified as McDonald's.
"""The text-path vendor prompt must not reference 'McDonald' — it biases
the model toward returning McDonald's whenever OCR text is unclear.
Pinned to text mode so vision path (which has its own cleaner prompt) does
not interfere.
"""
agent = _make_agent()
captured: list[str] = []
@@ -689,24 +689,22 @@ async def test_vendor_prompt_does_not_contain_mcdonalds():
agent._llm.submit = _capture
await agent._parse_receipt_text(
'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
'homedepot.jpg',
expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
)
with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
await agent._parse_receipt_text(
'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
'homedepot.jpg',
expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
)
full_prompt = ' '.join(captured)
assert 'McDonald' not in full_prompt, (
"Vendor prompt must not contain 'McDonald' — it biases the model toward "
"returning McDonald's for any ambiguous receipt."
"Text-path prompt must not contain 'McDonald' — it biases the model."
)
@pytest.mark.asyncio
async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
"""Prompt must explicitly tell the LLM not to substitute a brand name that
isn't in the OCR text — prevents "default to well-known fast food" behaviour.
"""
"""Text-path prompt must tell LLM not to substitute a brand not in the OCR text."""
agent = _make_agent()
captured: list[str] = []
@@ -720,14 +718,14 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
agent._llm.submit = _capture
await agent._parse_receipt_text(
'(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
'sergios.jpg',
expense_products=[{'id': 1, 'name': 'Meals'}],
)
with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
await agent._parse_receipt_text(
'(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
'sergios.jpg',
expense_products=[{'id': 1, 'name': 'Meals'}],
)
full_prompt = ' '.join(captured)
# The prompt should warn the model not to invent brand names
assert 'only use a brand name' in full_prompt.lower() or \
'do not' in full_prompt.lower() or \
'not substitute' in full_prompt.lower(), (
@@ -735,6 +733,129 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
)
# ---------------------------------------------------------------------------
# Vision LLM path — _parse_receipt_text with b64/mimetype
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_vision_path_sends_image_to_llm():
    """Vision mode must attach the base64 image to the single LLM message.

    The stubbed LLM reply carries only vendor and product, so the asserted
    amount has to come from the receipt text passed alongside the image.
    """
    agent = _make_agent()
    sent: list = []
    # Canned LLM reply; the attribute is configured via the constructor.
    reply = MagicMock(content='{"vendor":"Home Depot","product_name":"Supplies"}')

    async def _record(messages, caller=None):
        # Keep every message the agent submits so it can be inspected below.
        sent.extend(messages)
        return reply

    agent._llm.submit = _record

    force_vision = patch(
        'agent_service.agents.expenses_agent._get_vision_mode',
        return_value='vision',
    )
    with force_vision:
        result = await agent._parse_receipt_text(
            'THE HOME DEPOT\nTotal: $36.78',
            'homedepot.jpg',
            expense_products=[{'id': 1, 'name': 'Supplies'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )

    assert result['vendor'] == 'Home Depot'
    assert result['amount'] == 36.78

    assert len(sent) == 1
    only_message = sent[0]
    assert 'images' in only_message, "Vision path must include 'images' in LLM message"
    assert only_message['images'] == ['FAKEBASE64DATA']
@pytest.mark.asyncio
async def test_text_mode_skips_vision_even_with_image():
    """With the mode pinned to 'text', a supplied image must not reach the LLM."""
    agent = _make_agent()
    sent: list = []
    # Canned LLM reply; the attribute is configured via the constructor.
    reply = MagicMock(content='{"vendor":"Home Depot","product_name":"Supplies"}')

    async def _record(messages, caller=None):
        # Keep every message the agent submits so it can be inspected below.
        sent.extend(messages)
        return reply

    agent._llm.submit = _record

    force_text = patch(
        'agent_service.agents.expenses_agent._get_vision_mode',
        return_value='text',
    )
    with force_text:
        # b64 and mimetype are passed on purpose: text mode has to ignore them.
        await agent._parse_receipt_text(
            'THE HOME DEPOT\nTotal: $36.78',
            'homedepot.jpg',
            expense_products=[{'id': 1, 'name': 'Supplies'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )

    assert len(sent) == 1
    first_message = sent[0]
    assert 'images' not in first_message, (
        "Text mode must NOT send images to the LLM."
    )
@pytest.mark.asyncio
async def test_vision_falls_back_to_text_on_llm_error():
    """A failing first LLM call must be followed by a second, successful one.

    The stub raises on its first invocation and answers normally afterwards,
    so a correct fallback yields exactly two submissions and a parsed result.
    """
    agent = _make_agent()
    attempts = 0
    # Canned LLM reply returned from the second call onwards.
    reply = MagicMock(content='{"vendor":"Shell","product_name":"Fuel"}')

    async def _fail_once(messages, caller=None):
        nonlocal attempts
        attempts += 1
        if attempts == 1:
            raise RuntimeError('simulated vision model error')
        return reply

    agent._llm.submit = _fail_once

    force_vision = patch(
        'agent_service.agents.expenses_agent._get_vision_mode',
        return_value='vision',
    )
    with force_vision:
        result = await agent._parse_receipt_text(
            'SHELL GAS STATION\nTotal Sale $55.00',
            'shell.jpg',
            expense_products=[{'id': 1, 'name': 'Fuel'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )

    assert attempts == 2, "Must make exactly 2 LLM calls (vision failed, text succeeded)"
    assert result['vendor'] == 'Shell'
    assert result['amount'] == 55.00
@pytest.mark.asyncio
async def test_non_image_mimetype_uses_text_path_in_vision_mode():
    """A PDF upload must not be sent as an image, even in vision mode."""
    agent = _make_agent()
    sent: list = []
    # Canned LLM reply; the attribute is configured via the constructor.
    reply = MagicMock(content='{"vendor":"United Airlines","product_name":"Travel"}')

    async def _record(messages, caller=None):
        # Keep every message the agent submits so it can be inspected below.
        sent.extend(messages)
        return reply

    agent._llm.submit = _record

    force_vision = patch(
        'agent_service.agents.expenses_agent._get_vision_mode',
        return_value='vision',
    )
    with force_vision:
        await agent._parse_receipt_text(
            'United Airlines\nBaggage Fee\nTotal: $45.00',
            'ticket.pdf',
            expense_products=[{'id': 1, 'name': 'Travel'}],
            b64='FAKEBASE64DATA',
            mimetype='application/pdf',  # NOT an image — no vision
        )

    assert len(sent) == 1
    first_message = sent[0]
    assert 'images' not in first_message, (
        "PDF receipts must not be sent as images even in vision mode."
    )
# ---------------------------------------------------------------------------
# parse_upload — receipt_parser.py
# ---------------------------------------------------------------------------