Add vision LLM path for receipt vendor/category identification

When RECEIPT_VISION_MODE=vision (default), uploaded receipt images are sent
directly to the vision-capable LLM (llama3.2-vision via Ollama) instead of
the OCR text excerpt.  The model can read logos, stylised fonts, and layouts
that Tesseract OCR mangles (Home Depot, HMSHost/Sergio's, etc.).

Architecture:
- amount + date: always from Tesseract regex (deterministic, never LLM)
- vendor + category: vision LLM when image available, text LLM as fallback
- Fallthrough: if vision call fails for any reason, text path is tried next
- PDF/TXT/HTML receipts: always use text path (not visual media)

Revert instantly without a rebuild:
  echo "RECEIPT_VISION_MODE=text" >> /root/odoo/odoo-ai/.env
  docker compose up -d agent-service

config.py: add receipt_vision_mode setting (default 'vision')
expenses_agent.py: _VISION_MIMETYPES, _get_vision_mode() helper,
  dual-path _parse_receipt_text (b64/mimetype params), _act() passes b64
tests: 92 passing — 4 new vision tests, 2 existing prompt tests
  pinned to text mode via _get_vision_mode patch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-21 01:06:55 -04:00
parent db06fede5f
commit a736f3352b
3 changed files with 258 additions and 45 deletions

View File

@@ -55,6 +55,26 @@ def _is_likely_bank_statement(text: str) -> bool:
return count >= _STMT_AMOUNT_LINE_THRESHOLD
# Image MIME types the vision LLM can process. PDF/HTML/TXT use text-only path.
_VISION_MIMETYPES = frozenset({
'image/jpeg', 'image/png', 'image/gif',
'image/bmp', 'image/tiff', 'image/webp',
})
def _get_vision_mode() -> str:
"""Return the configured receipt_vision_mode ('vision' | 'text').
Wraps get_settings() so tests can patch this single symbol instead of
fighting the lru_cache on Settings. Defaults to 'vision' on any error.
"""
try:
from ..config import get_settings
return get_settings().receipt_vision_mode
except Exception:
return 'vision'
_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD
_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY
_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY
@@ -323,12 +343,16 @@ class ExpensesAgent(BaseAgent):
logger.info('ocr filename=%r date_hint=%r ocr_len=%d text_preview=%r',
r.get('filename'), r.get('date_from_name'), ocr_len, ocr_preview)
# Parse all receipts concurrently
# Parse all receipts concurrently.
# b64 + mimetype are forwarded so _parse_receipt_text can use the
# vision LLM path when RECEIPT_VISION_MODE=vision (the default).
parse_tasks = [
self._parse_receipt_text(
r.get('text', ''), r.get('filename', 'receipt'),
expense_products=expense_products,
date_hint=r.get('date_from_name'),
b64=r.get('b64'),
mimetype=r.get('mimetype'),
)
for r in unique_receipts
]
@@ -500,17 +524,26 @@ class ExpensesAgent(BaseAgent):
async def _parse_receipt_text(self, text: str, filename: str,
expense_products: list = None,
date_hint: str = None) -> dict:
date_hint: str = None,
b64: str = None,
mimetype: str = None) -> dict:
"""Parse a single receipt into structured fields.
Strategy (most-reliable first):
amount → regex on OCR text (deterministic)
date → filename timestamp > OCR regex > today
vendor → LLM (short excerpt, first ~600 chars)
product_name→ LLM (semantic match against expense product list)
amount → regex on OCR text (deterministic, never ask LLM)
date → filename timestamp > OCR regex > today
vendor → vision LLM (image) > text LLM (OCR excerpt) > filename
product_name → same LLM call as vendor
The LLM is intentionally NOT asked for amount or date — the local
model hallucinates those fields when OCR text is ambiguous.
Vision mode (RECEIPT_VISION_MODE=vision, default):
When the upload is a JPEG/PNG/etc., the raw image is sent to the
vision-capable LLM so it can read logos and stylised fonts that
Tesseract OCR mangles. If the vision call fails for any reason
(model error, timeout, bad JSON) the text path is used as fallback.
Text mode (RECEIPT_VISION_MODE=text):
Classic behaviour — only Tesseract OCR text is forwarded to the LLM.
Set in .env to instantly revert without rebuilding the container.
"""
today = _date.today().isoformat()
stripped = (text or '').strip()
@@ -541,15 +574,77 @@ class ExpensesAgent(BaseAgent):
else:
date = today
# ── Vendor + Category: LLM (two fields only) ─────────────────────────
# ── Vendor + Category: LLM ───────────────────────────────────────────
vendor = filename
product_name = ''
product_list = ', '.join(f'"{p["name"]}"' for p in (expense_products or []))
if not product_list:
# No expense products configured — nothing to categorise
return {'vendor': vendor, 'amount': amount, 'date': date,
'time': None, 'product_name': ''}
# Shared category guidance used in both prompt paths
_cat_guide = (
'Guide: restaurant / cafe / fast food / food court → food/meal product; '
'airline / airport / transit / taxi / parking / rental car → travel product; '
'gas station / petrol / fuel → fuel product; '
'hotel / motel / lodging → accommodation product; '
'hardware / home improvement / tech / office supply store → supplies product. '
'Return "" if nothing fits.'
)
# ── Path A: vision LLM ───────────────────────────────────────────────
# Use when: vision mode is enabled AND the file is a supported image type.
# The model sees the actual receipt image — no OCR garbling, reads logos
# and stylised fonts directly. Falls through to Path B on any failure.
use_vision = (
_get_vision_mode() == 'vision'
and bool(b64)
and mimetype in _VISION_MIMETYPES
)
if use_vision:
vision_prompt = (
'Return ONLY valid JSON with exactly two keys:\n'
'"vendor": the business name printed at the top of this receipt '
'(first 1-3 lines; ignore slogans, product item names, '
'and payment-processor logos).\n'
f'"product_name": pick the single best match from [{product_list}]. '
f'{_cat_guide}\n'
'JSON only:'
)
try:
resp = await self._llm.submit(
[{'role': 'user', 'content': vision_prompt, 'images': [b64]}],
caller='expenses_agent_receipt_parser',
)
raw = (resp.content or '').strip()
first, last = raw.find('{'), raw.rfind('}')
if first != -1 and last > first:
data = json.loads(raw[first:last + 1])
v = str(data.get('vendor', '') or '').strip()
if v:
vendor = v
product_name = str(data.get('product_name', '') or '').strip()
logger.debug('vision vendor=%r product=%r for %s', vendor, product_name, filename)
return {'vendor': vendor, 'amount': amount, 'date': date,
'time': None, 'product_name': product_name}
except Exception as exc:
logger.warning(
'Vision LLM failed for %s: %s — falling back to text path',
filename, exc,
)
# Reset vendor so the text path starts fresh
vendor = filename
product_name = ''
# ── Path B: text-only (OCR excerpt) ─────────────────────────────────
# Used when: vision mode is off, mimetype is not an image (PDF/TXT/HTML),
# or the vision call failed.
if not ocr_failed:
# Give LLM only the header of the receipt — vendor is in the first lines
excerpt = stripped[:600]
prompt = (
text_prompt = (
'Return ONLY valid JSON with exactly two keys:\n'
'"vendor": the business name printed at the TOP of the receipt '
'(usually the first 1-3 lines). '
@@ -565,28 +660,20 @@ class ExpensesAgent(BaseAgent):
'multiple transactions rather than a single merchant receipt, '
'use "". Use "" if no clear business name is visible.\n'
f'"product_name": pick the single best match from [{product_list}]. '
'Guide: restaurant / cafe / fast food / food court → food/meal product; '
'airline / airport / transit / taxi / parking / rental car → travel product; '
'gas station / petrol / fuel → fuel product; '
'hotel / motel / lodging → accommodation product; '
'hardware / home improvement / tech / office supply store → supplies product. '
'Return "" if nothing fits.\n\n'
f'{_cat_guide}\n\n'
f'Receipt text:\n{excerpt}\n\nJSON only:'
)
elif product_list:
# OCR failed — guess category from filename only
prompt = (
else:
# OCR failed entirely — guess category from filename only
text_prompt = (
f'A receipt file named "{filename}" could not be read. '
f'Pick the most likely match from [{product_list}] based on the filename, '
f'or "". Return ONLY: {{"vendor": "", "product_name": "..."}}'
)
else:
return {'vendor': filename, 'amount': amount, 'date': date,
'time': None, 'product_name': ''}
try:
resp = await self._llm.submit(
[{'role': 'user', 'content': prompt}],
[{'role': 'user', 'content': text_prompt}],
caller='expenses_agent_receipt_parser',
)
raw = (resp.content or '').strip()

View File

@@ -50,6 +50,11 @@ class Settings(BaseSettings):
postgres_min_connections: int = 2
postgres_max_connections: int = 10
# Receipt OCR / vision
# 'vision' — use vision LLM for vendor+category when an image is uploaded (default)
# 'text' — use Tesseract OCR text only (set RECEIPT_VISION_MODE=text to revert)
receipt_vision_mode: str = 'vision'
# Rate limiting
dispatch_rate_limit_per_user: int = 30 # requests per minute
directive_timeout_minutes: int = 10

View File

@@ -428,7 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates():
from agent_service.agents.expenses_agent import (
_extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
_MONTH_MAP,
_MONTH_MAP, _get_vision_mode,
)
@@ -671,10 +671,10 @@ async def test_parse_ocr_failed_skips_llm_amount():
@pytest.mark.asyncio
async def test_vendor_prompt_does_not_contain_mcdonalds():
"""The vendor LLM prompt must not reference 'McDonald' as a correction
example — it biases the model toward returning McDonald's whenever OCR
text is unclear, causing unrelated receipts (Home Depot, HMSHost) to be
misidentified as McDonald's.
"""The text-path vendor prompt must not reference 'McDonald' — it biases
the model toward returning McDonald's whenever OCR text is unclear.
Pinned to text mode so vision path (which has its own cleaner prompt) does
not interfere.
"""
agent = _make_agent()
captured: list[str] = []
@@ -689,24 +689,22 @@ async def test_vendor_prompt_does_not_contain_mcdonalds():
agent._llm.submit = _capture
await agent._parse_receipt_text(
'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
'homedepot.jpg',
expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
)
with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
await agent._parse_receipt_text(
'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
'homedepot.jpg',
expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
)
full_prompt = ' '.join(captured)
assert 'McDonald' not in full_prompt, (
"Vendor prompt must not contain 'McDonald' — it biases the model toward "
"returning McDonald's for any ambiguous receipt."
"Text-path prompt must not contain 'McDonald' — it biases the model."
)
@pytest.mark.asyncio
async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
"""Prompt must explicitly tell the LLM not to substitute a brand name that
isn't in the OCR text — prevents "default to well-known fast food" behaviour.
"""
"""Text-path prompt must tell LLM not to substitute a brand not in the OCR text."""
agent = _make_agent()
captured: list[str] = []
@@ -720,14 +718,14 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
agent._llm.submit = _capture
await agent._parse_receipt_text(
'(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
'sergios.jpg',
expense_products=[{'id': 1, 'name': 'Meals'}],
)
with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
await agent._parse_receipt_text(
'(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
'sergios.jpg',
expense_products=[{'id': 1, 'name': 'Meals'}],
)
full_prompt = ' '.join(captured)
# The prompt should warn the model not to invent brand names
assert 'only use a brand name' in full_prompt.lower() or \
'do not' in full_prompt.lower() or \
'not substitute' in full_prompt.lower(), (
@@ -735,6 +733,129 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
)
# ---------------------------------------------------------------------------
# Vision LLM path — _parse_receipt_text with b64/mimetype
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_vision_path_sends_image_to_llm():
    """Vision mode must attach the base64 payload to the LLM message.

    With _get_vision_mode patched to 'vision' and an image/jpeg upload, the
    single message handed to the LLM has to carry an 'images' list holding
    the b64 string; the vendor comes from the LLM reply while the amount is
    still taken from the OCR text.
    """
    agent = _make_agent()
    sent: list = []

    reply = MagicMock()
    reply.content = '{"vendor":"Home Depot","product_name":"Supplies"}'

    async def _record(messages, caller=None):
        # Stand-in for the LLM client: remember what was sent, return the canned reply.
        sent.extend(messages)
        return reply

    agent._llm.submit = _record

    vision_on = patch(
        'agent_service.agents.expenses_agent._get_vision_mode',
        return_value='vision',
    )
    with vision_on:
        parsed = await agent._parse_receipt_text(
            'THE HOME DEPOT\nTotal: $36.78',
            'homedepot.jpg',
            expense_products=[{'id': 1, 'name': 'Supplies'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )

    assert parsed['vendor'] == 'Home Depot'
    assert parsed['amount'] == 36.78
    assert len(sent) == 1
    message = sent[0]
    assert 'images' in message, "Vision path must include 'images' in LLM message"
    assert message['images'] == ['FAKEBASE64DATA']
@pytest.mark.asyncio
async def test_text_mode_skips_vision_even_with_image():
    """Text mode must ignore the image payload entirely.

    Even though b64 and an image mimetype are supplied, patching
    _get_vision_mode to 'text' has to result in exactly one LLM message
    without an 'images' key.
    """
    agent = _make_agent()
    sent: list = []

    reply = MagicMock()
    reply.content = '{"vendor":"Home Depot","product_name":"Supplies"}'

    async def _record(messages, caller=None):
        # Stand-in for the LLM client: remember what was sent, return the canned reply.
        sent.extend(messages)
        return reply

    agent._llm.submit = _record

    text_only = patch(
        'agent_service.agents.expenses_agent._get_vision_mode',
        return_value='text',
    )
    with text_only:
        await agent._parse_receipt_text(
            'THE HOME DEPOT\nTotal: $36.78',
            'homedepot.jpg',
            expense_products=[{'id': 1, 'name': 'Supplies'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )

    assert len(sent) == 1
    assert 'images' not in sent[0], (
        "Text mode must NOT send images to the LLM."
    )
@pytest.mark.asyncio
async def test_vision_falls_back_to_text_on_llm_error():
    """A failing vision call must be followed by a text-path call.

    The fake LLM raises on its first invocation and answers normally on the
    second, so the parser is expected to call it exactly twice and to return
    the vendor from the second reply together with the OCR-derived amount.
    """
    agent = _make_agent()
    attempts = 0

    reply = MagicMock()
    reply.content = '{"vendor":"Shell","product_name":"Fuel"}'

    async def _fail_once(messages, caller=None):
        # First invocation simulates a broken vision model; later ones succeed.
        nonlocal attempts
        attempts += 1
        if attempts == 1:
            raise RuntimeError('simulated vision model error')
        return reply

    agent._llm.submit = _fail_once

    vision_on = patch(
        'agent_service.agents.expenses_agent._get_vision_mode',
        return_value='vision',
    )
    with vision_on:
        parsed = await agent._parse_receipt_text(
            'SHELL GAS STATION\nTotal Sale $55.00',
            'shell.jpg',
            expense_products=[{'id': 1, 'name': 'Fuel'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )

    assert attempts == 2, "Must make exactly 2 LLM calls (vision failed, text succeeded)"
    assert parsed['vendor'] == 'Shell'
    assert parsed['amount'] == 55.00
@pytest.mark.asyncio
async def test_non_image_mimetype_uses_text_path_in_vision_mode():
    """Non-image uploads must stay on the text path even in vision mode.

    A b64 payload is supplied together with an application/pdf mimetype;
    the single LLM message must not contain an 'images' key.
    """
    agent = _make_agent()
    sent: list = []

    reply = MagicMock()
    reply.content = '{"vendor":"United Airlines","product_name":"Travel"}'

    async def _record(messages, caller=None):
        # Stand-in for the LLM client: remember what was sent, return the canned reply.
        sent.extend(messages)
        return reply

    agent._llm.submit = _record

    vision_on = patch(
        'agent_service.agents.expenses_agent._get_vision_mode',
        return_value='vision',
    )
    with vision_on:
        await agent._parse_receipt_text(
            'United Airlines\nBaggage Fee\nTotal: $45.00',
            'ticket.pdf',
            expense_products=[{'id': 1, 'name': 'Travel'}],
            b64='FAKEBASE64DATA',
            mimetype='application/pdf',  # not an image, so no vision call
        )

    assert len(sent) == 1
    assert 'images' not in sent[0], (
        "PDF receipts must not be sent as images even in vision mode."
    )
# ---------------------------------------------------------------------------
# parse_upload — receipt_parser.py
# ---------------------------------------------------------------------------