Add vision LLM path for receipt vendor/category identification
When RECEIPT_VISION_MODE=vision (default), uploaded receipt images are sent directly to the vision-capable LLM (llama3.2-vision via Ollama) instead of the OCR text excerpt. The model can read logos, stylised fonts, and layouts that Tesseract OCR mangles (Home Depot, HMSHost/Sergio's, etc.).

Architecture:
- amount + date: always from Tesseract regex (deterministic, never LLM)
- vendor + category: vision LLM when image available, text LLM as fallback
- Fallthrough: if vision call fails for any reason, text path is tried next
- PDF/TXT/HTML receipts: always use text path (not visual media)

Revert instantly without a rebuild:
    echo "RECEIPT_VISION_MODE=text" >> /root/odoo/odoo-ai/.env
    docker compose up -d agent-service

config.py: add receipt_vision_mode setting (default 'vision')
expenses_agent.py: _VISION_MIMETYPES, _get_vision_mode() helper, dual-path _parse_receipt_text (b64/mimetype params), _act() passes b64
tests: 92 passing — 4 new vision tests, 2 existing prompt tests pinned to text mode via _get_vision_mode patch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -55,6 +55,26 @@ def _is_likely_bank_statement(text: str) -> bool:
|
||||
return count >= _STMT_AMOUNT_LINE_THRESHOLD
|
||||
|
||||
|
||||
# Image MIME types the vision LLM can process. PDF/HTML/TXT use text-only path.
|
||||
_VISION_MIMETYPES = frozenset({
|
||||
'image/jpeg', 'image/png', 'image/gif',
|
||||
'image/bmp', 'image/tiff', 'image/webp',
|
||||
})
|
||||
|
||||
|
||||
def _get_vision_mode() -> str:
|
||||
"""Return the configured receipt_vision_mode ('vision' | 'text').
|
||||
|
||||
Wraps get_settings() so tests can patch this single symbol instead of
|
||||
fighting the lru_cache on Settings. Defaults to 'vision' on any error.
|
||||
"""
|
||||
try:
|
||||
from ..config import get_settings
|
||||
return get_settings().receipt_vision_mode
|
||||
except Exception:
|
||||
return 'vision'
|
||||
|
||||
|
||||
_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD
|
||||
_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY
|
||||
_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY
|
||||
@@ -323,12 +343,16 @@ class ExpensesAgent(BaseAgent):
|
||||
logger.info('ocr filename=%r date_hint=%r ocr_len=%d text_preview=%r',
|
||||
r.get('filename'), r.get('date_from_name'), ocr_len, ocr_preview)
|
||||
|
||||
# Parse all receipts concurrently
|
||||
# Parse all receipts concurrently.
|
||||
# b64 + mimetype are forwarded so _parse_receipt_text can use the
|
||||
# vision LLM path when RECEIPT_VISION_MODE=vision (the default).
|
||||
parse_tasks = [
|
||||
self._parse_receipt_text(
|
||||
r.get('text', ''), r.get('filename', 'receipt'),
|
||||
expense_products=expense_products,
|
||||
date_hint=r.get('date_from_name'),
|
||||
b64=r.get('b64'),
|
||||
mimetype=r.get('mimetype'),
|
||||
)
|
||||
for r in unique_receipts
|
||||
]
|
||||
@@ -500,17 +524,26 @@ class ExpensesAgent(BaseAgent):
|
||||
|
||||
async def _parse_receipt_text(self, text: str, filename: str,
|
||||
expense_products: list = None,
|
||||
date_hint: str = None) -> dict:
|
||||
date_hint: str = None,
|
||||
b64: str = None,
|
||||
mimetype: str = None) -> dict:
|
||||
"""Parse a single receipt into structured fields.
|
||||
|
||||
Strategy (most-reliable first):
|
||||
amount → regex on OCR text (deterministic)
|
||||
date → filename timestamp > OCR regex > today
|
||||
vendor → LLM (short excerpt, first ~600 chars)
|
||||
product_name→ LLM (semantic match against expense product list)
|
||||
amount → regex on OCR text (deterministic, never ask LLM)
|
||||
date → filename timestamp > OCR regex > today
|
||||
vendor → vision LLM (image) > text LLM (OCR excerpt) > filename
|
||||
product_name → same LLM call as vendor
|
||||
|
||||
The LLM is intentionally NOT asked for amount or date — the local
|
||||
model hallucinates those fields when OCR text is ambiguous.
|
||||
Vision mode (RECEIPT_VISION_MODE=vision, default):
|
||||
When the upload is a JPEG/PNG/etc., the raw image is sent to the
|
||||
vision-capable LLM so it can read logos and stylised fonts that
|
||||
Tesseract OCR mangles. If the vision call fails for any reason
|
||||
(model error, timeout, bad JSON) the text path is used as fallback.
|
||||
|
||||
Text mode (RECEIPT_VISION_MODE=text):
|
||||
Classic behaviour — only Tesseract OCR text is forwarded to the LLM.
|
||||
Set in .env to instantly revert without rebuilding the container.
|
||||
"""
|
||||
today = _date.today().isoformat()
|
||||
stripped = (text or '').strip()
|
||||
@@ -541,15 +574,77 @@ class ExpensesAgent(BaseAgent):
|
||||
else:
|
||||
date = today
|
||||
|
||||
# ── Vendor + Category: LLM (two fields only) ─────────────────────────
|
||||
# ── Vendor + Category: LLM ───────────────────────────────────────────
|
||||
vendor = filename
|
||||
product_name = ''
|
||||
product_list = ', '.join(f'"{p["name"]}"' for p in (expense_products or []))
|
||||
|
||||
if not product_list:
|
||||
# No expense products configured — nothing to categorise
|
||||
return {'vendor': vendor, 'amount': amount, 'date': date,
|
||||
'time': None, 'product_name': ''}
|
||||
|
||||
# Shared category guidance used in both prompt paths
|
||||
_cat_guide = (
|
||||
'Guide: restaurant / cafe / fast food / food court → food/meal product; '
|
||||
'airline / airport / transit / taxi / parking / rental car → travel product; '
|
||||
'gas station / petrol / fuel → fuel product; '
|
||||
'hotel / motel / lodging → accommodation product; '
|
||||
'hardware / home improvement / tech / office supply store → supplies product. '
|
||||
'Return "" if nothing fits.'
|
||||
)
|
||||
|
||||
# ── Path A: vision LLM ───────────────────────────────────────────────
|
||||
# Use when: vision mode is enabled AND the file is a supported image type.
|
||||
# The model sees the actual receipt image — no OCR garbling, reads logos
|
||||
# and stylised fonts directly. Falls through to Path B on any failure.
|
||||
use_vision = (
|
||||
_get_vision_mode() == 'vision'
|
||||
and bool(b64)
|
||||
and mimetype in _VISION_MIMETYPES
|
||||
)
|
||||
|
||||
if use_vision:
|
||||
vision_prompt = (
|
||||
'Return ONLY valid JSON with exactly two keys:\n'
|
||||
'"vendor": the business name printed at the top of this receipt '
|
||||
'(first 1-3 lines; ignore slogans, product item names, '
|
||||
'and payment-processor logos).\n'
|
||||
f'"product_name": pick the single best match from [{product_list}]. '
|
||||
f'{_cat_guide}\n'
|
||||
'JSON only:'
|
||||
)
|
||||
try:
|
||||
resp = await self._llm.submit(
|
||||
[{'role': 'user', 'content': vision_prompt, 'images': [b64]}],
|
||||
caller='expenses_agent_receipt_parser',
|
||||
)
|
||||
raw = (resp.content or '').strip()
|
||||
first, last = raw.find('{'), raw.rfind('}')
|
||||
if first != -1 and last > first:
|
||||
data = json.loads(raw[first:last + 1])
|
||||
v = str(data.get('vendor', '') or '').strip()
|
||||
if v:
|
||||
vendor = v
|
||||
product_name = str(data.get('product_name', '') or '').strip()
|
||||
logger.debug('vision vendor=%r product=%r for %s', vendor, product_name, filename)
|
||||
return {'vendor': vendor, 'amount': amount, 'date': date,
|
||||
'time': None, 'product_name': product_name}
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
'Vision LLM failed for %s: %s — falling back to text path',
|
||||
filename, exc,
|
||||
)
|
||||
# Reset vendor so the text path starts fresh
|
||||
vendor = filename
|
||||
product_name = ''
|
||||
|
||||
# ── Path B: text-only (OCR excerpt) ─────────────────────────────────
|
||||
# Used when: vision mode is off, mimetype is not an image (PDF/TXT/HTML),
|
||||
# or the vision call failed.
|
||||
if not ocr_failed:
|
||||
# Give LLM only the header of the receipt — vendor is in the first lines
|
||||
excerpt = stripped[:600]
|
||||
prompt = (
|
||||
text_prompt = (
|
||||
'Return ONLY valid JSON with exactly two keys:\n'
|
||||
'"vendor": the business name printed at the TOP of the receipt '
|
||||
'(usually the first 1-3 lines). '
|
||||
@@ -565,28 +660,20 @@ class ExpensesAgent(BaseAgent):
|
||||
'multiple transactions rather than a single merchant receipt, '
|
||||
'use "". Use "" if no clear business name is visible.\n'
|
||||
f'"product_name": pick the single best match from [{product_list}]. '
|
||||
'Guide: restaurant / cafe / fast food / food court → food/meal product; '
|
||||
'airline / airport / transit / taxi / parking / rental car → travel product; '
|
||||
'gas station / petrol / fuel → fuel product; '
|
||||
'hotel / motel / lodging → accommodation product; '
|
||||
'hardware / home improvement / tech / office supply store → supplies product. '
|
||||
'Return "" if nothing fits.\n\n'
|
||||
f'{_cat_guide}\n\n'
|
||||
f'Receipt text:\n{excerpt}\n\nJSON only:'
|
||||
)
|
||||
elif product_list:
|
||||
# OCR failed — guess category from filename only
|
||||
prompt = (
|
||||
else:
|
||||
# OCR failed entirely — guess category from filename only
|
||||
text_prompt = (
|
||||
f'A receipt file named "{filename}" could not be read. '
|
||||
f'Pick the most likely match from [{product_list}] based on the filename, '
|
||||
f'or "". Return ONLY: {{"vendor": "", "product_name": "..."}}'
|
||||
)
|
||||
else:
|
||||
return {'vendor': filename, 'amount': amount, 'date': date,
|
||||
'time': None, 'product_name': ''}
|
||||
|
||||
try:
|
||||
resp = await self._llm.submit(
|
||||
[{'role': 'user', 'content': prompt}],
|
||||
[{'role': 'user', 'content': text_prompt}],
|
||||
caller='expenses_agent_receipt_parser',
|
||||
)
|
||||
raw = (resp.content or '').strip()
|
||||
|
||||
@@ -50,6 +50,11 @@ class Settings(BaseSettings):
|
||||
postgres_min_connections: int = 2
|
||||
postgres_max_connections: int = 10
|
||||
|
||||
# Receipt OCR / vision
|
||||
# 'vision' — use vision LLM for vendor+category when an image is uploaded (default)
|
||||
# 'text' — use Tesseract OCR text only (set RECEIPT_VISION_MODE=text to revert)
|
||||
receipt_vision_mode: str = 'vision'
|
||||
|
||||
# Rate limiting
|
||||
dispatch_rate_limit_per_user: int = 30 # requests per minute
|
||||
directive_timeout_minutes: int = 10
|
||||
|
||||
Reference in New Issue
Block a user