Add vision LLM path for receipt vendor/category identification

When RECEIPT_VISION_MODE=vision (default), uploaded receipt images are sent
directly to the vision-capable LLM (llama3.2-vision via Ollama) instead of
the OCR text excerpt.  The model can read logos, stylised fonts, and layouts
that Tesseract OCR mangles (Home Depot, HMSHost/Sergio's, etc.).

Architecture:
- amount + date: always deterministic, never LLM (amount from Tesseract OCR regex; date from filename timestamp, then OCR regex, then today)
- vendor + category: vision LLM when image available, text LLM as fallback
- Fallthrough: if vision call fails for any reason, text path is tried next
- PDF/TXT/HTML receipts: always use text path (not visual media)

Revert instantly without a rebuild:
  echo "RECEIPT_VISION_MODE=text" >> /root/odoo/odoo-ai/.env
  docker compose up -d agent-service

config.py: add receipt_vision_mode setting (default 'vision')
expenses_agent.py: _VISION_MIMETYPES, _get_vision_mode() helper,
  dual-path _parse_receipt_text (b64/mimetype params), _act() passes b64
tests: 92 passing — 4 new vision tests, 2 existing prompt tests
  pinned to text mode via _get_vision_mode patch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-21 01:06:55 -04:00
parent db06fede5f
commit a736f3352b
3 changed files with 258 additions and 45 deletions

View File

@@ -55,6 +55,26 @@ def _is_likely_bank_statement(text: str) -> bool:
return count >= _STMT_AMOUNT_LINE_THRESHOLD
# Image MIME types the vision LLM can process. PDF/HTML/TXT use text-only path.
_VISION_MIMETYPES = frozenset({
'image/jpeg', 'image/png', 'image/gif',
'image/bmp', 'image/tiff', 'image/webp',
})
def _get_vision_mode() -> str:
"""Return the configured receipt_vision_mode ('vision' | 'text').
Wraps get_settings() so tests can patch this single symbol instead of
fighting the lru_cache on Settings. Defaults to 'vision' on any error.
"""
try:
from ..config import get_settings
return get_settings().receipt_vision_mode
except Exception:
return 'vision'
# Compiled date patterns; each exposes three capture groups in the order shown
# in its trailing comment. Either '-' or '/' is accepted as a separator, and
# the two separators within one match are not required to be the same.
# NOTE(review): presumably used for the OCR-regex date fallback — the call
# sites are outside this view; confirm.
_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD (two-digit month and day required)
_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY (one- or two-digit month and day, four-digit year)
_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY (trailing \b stops it matching the first two digits of a four-digit year)
@@ -323,12 +343,16 @@ class ExpensesAgent(BaseAgent):
logger.info('ocr filename=%r date_hint=%r ocr_len=%d text_preview=%r',
r.get('filename'), r.get('date_from_name'), ocr_len, ocr_preview)
# Parse all receipts concurrently
# Parse all receipts concurrently.
# b64 + mimetype are forwarded so _parse_receipt_text can use the
# vision LLM path when RECEIPT_VISION_MODE=vision (the default).
parse_tasks = [
self._parse_receipt_text(
r.get('text', ''), r.get('filename', 'receipt'),
expense_products=expense_products,
date_hint=r.get('date_from_name'),
b64=r.get('b64'),
mimetype=r.get('mimetype'),
)
for r in unique_receipts
]
@@ -500,17 +524,26 @@ class ExpensesAgent(BaseAgent):
async def _parse_receipt_text(self, text: str, filename: str,
expense_products: list = None,
date_hint: str = None) -> dict:
date_hint: str = None,
b64: str = None,
mimetype: str = None) -> dict:
"""Parse a single receipt into structured fields.
Strategy (most-reliable first):
amount → regex on OCR text (deterministic)
date → filename timestamp > OCR regex > today
vendor → LLM (short excerpt, first ~600 chars)
product_name→ LLM (semantic match against expense product list)
amount → regex on OCR text (deterministic, never ask LLM)
date → filename timestamp > OCR regex > today
vendor vision LLM (image) > text LLM (OCR excerpt) > filename
product_name → same LLM call as vendor
The LLM is intentionally NOT asked for amount or date — the local
model hallucinates those fields when OCR text is ambiguous.
Vision mode (RECEIPT_VISION_MODE=vision, default):
When the upload is a JPEG/PNG/etc., the raw image is sent to the
vision-capable LLM so it can read logos and stylised fonts that
Tesseract OCR mangles. If the vision call fails for any reason
(model error, timeout, bad JSON) the text path is used as fallback.
Text mode (RECEIPT_VISION_MODE=text):
Classic behaviour — only Tesseract OCR text is forwarded to the LLM.
Set in .env to instantly revert without rebuilding the container.
"""
today = _date.today().isoformat()
stripped = (text or '').strip()
@@ -541,15 +574,77 @@ class ExpensesAgent(BaseAgent):
else:
date = today
# ── Vendor + Category: LLM (two fields only) ─────────────────────────
# ── Vendor + Category: LLM ───────────────────────────────────────────
vendor = filename
product_name = ''
product_list = ', '.join(f'"{p["name"]}"' for p in (expense_products or []))
if not product_list:
# No expense products configured — nothing to categorise
return {'vendor': vendor, 'amount': amount, 'date': date,
'time': None, 'product_name': ''}
# Shared category guidance used in both prompt paths
_cat_guide = (
'Guide: restaurant / cafe / fast food / food court → food/meal product; '
'airline / airport / transit / taxi / parking / rental car → travel product; '
'gas station / petrol / fuel → fuel product; '
'hotel / motel / lodging → accommodation product; '
'hardware / home improvement / tech / office supply store → supplies product. '
'Return "" if nothing fits.'
)
# ── Path A: vision LLM ───────────────────────────────────────────────
# Use when: vision mode is enabled AND the file is a supported image type.
# The model sees the actual receipt image — no OCR garbling, reads logos
# and stylised fonts directly. Falls through to Path B on any failure.
use_vision = (
_get_vision_mode() == 'vision'
and bool(b64)
and mimetype in _VISION_MIMETYPES
)
if use_vision:
vision_prompt = (
'Return ONLY valid JSON with exactly two keys:\n'
'"vendor": the business name printed at the top of this receipt '
'(first 1-3 lines; ignore slogans, product item names, '
'and payment-processor logos).\n'
f'"product_name": pick the single best match from [{product_list}]. '
f'{_cat_guide}\n'
'JSON only:'
)
try:
resp = await self._llm.submit(
[{'role': 'user', 'content': vision_prompt, 'images': [b64]}],
caller='expenses_agent_receipt_parser',
)
raw = (resp.content or '').strip()
first, last = raw.find('{'), raw.rfind('}')
if first != -1 and last > first:
data = json.loads(raw[first:last + 1])
v = str(data.get('vendor', '') or '').strip()
if v:
vendor = v
product_name = str(data.get('product_name', '') or '').strip()
logger.debug('vision vendor=%r product=%r for %s', vendor, product_name, filename)
return {'vendor': vendor, 'amount': amount, 'date': date,
'time': None, 'product_name': product_name}
except Exception as exc:
logger.warning(
'Vision LLM failed for %s: %s — falling back to text path',
filename, exc,
)
# Reset vendor so the text path starts fresh
vendor = filename
product_name = ''
# ── Path B: text-only (OCR excerpt) ─────────────────────────────────
# Used when: vision mode is off, mimetype is not an image (PDF/TXT/HTML),
# or the vision call failed.
if not ocr_failed:
# Give LLM only the header of the receipt — vendor is in the first lines
excerpt = stripped[:600]
prompt = (
text_prompt = (
'Return ONLY valid JSON with exactly two keys:\n'
'"vendor": the business name printed at the TOP of the receipt '
'(usually the first 1-3 lines). '
@@ -565,28 +660,20 @@ class ExpensesAgent(BaseAgent):
'multiple transactions rather than a single merchant receipt, '
'use "". Use "" if no clear business name is visible.\n'
f'"product_name": pick the single best match from [{product_list}]. '
'Guide: restaurant / cafe / fast food / food court → food/meal product; '
'airline / airport / transit / taxi / parking / rental car → travel product; '
'gas station / petrol / fuel → fuel product; '
'hotel / motel / lodging → accommodation product; '
'hardware / home improvement / tech / office supply store → supplies product. '
'Return "" if nothing fits.\n\n'
f'{_cat_guide}\n\n'
f'Receipt text:\n{excerpt}\n\nJSON only:'
)
elif product_list:
# OCR failed — guess category from filename only
prompt = (
else:
# OCR failed entirely — guess category from filename only
text_prompt = (
f'A receipt file named "{filename}" could not be read. '
f'Pick the most likely match from [{product_list}] based on the filename, '
f'or "". Return ONLY: {{"vendor": "", "product_name": "..."}}'
)
else:
return {'vendor': filename, 'amount': amount, 'date': date,
'time': None, 'product_name': ''}
try:
resp = await self._llm.submit(
[{'role': 'user', 'content': prompt}],
[{'role': 'user', 'content': text_prompt}],
caller='expenses_agent_receipt_parser',
)
raw = (resp.content or '').strip()