Add vision LLM path for receipt vendor/category identification

When RECEIPT_VISION_MODE=vision (default), uploaded receipt images are sent directly to the vision-capable LLM (llama3.2-vision via Ollama) instead of the OCR text excerpt. The model can read logos, stylised fonts, and layouts that Tesseract OCR mangles (Home Depot, HMSHost/Sergio's, etc.).

Architecture:
- amount + date: always from Tesseract regex (deterministic, never LLM)
- vendor + category: vision LLM when image available, text LLM as fallback
- Fallthrough: if vision call fails for any reason, text path is tried next
- PDF/TXT/HTML receipts: always use text path (not visual media)

Revert instantly without a rebuild:
    echo "RECEIPT_VISION_MODE=text" >> /root/odoo/odoo-ai/.env
    docker compose up -d agent-service

Changes:
- config.py: add receipt_vision_mode setting (default 'vision')
- expenses_agent.py: _VISION_MIMETYPES, _get_vision_mode() helper, dual-path _parse_receipt_text (b64/mimetype params), _act() passes b64
- tests: 92 passing — 4 new vision tests, 2 existing prompt tests pinned to text mode via _get_vision_mode patch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 01:06:55 -04:00
parent db06fede5f
commit a736f3352b
3 changed files with 258 additions and 45 deletions
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -55,6 +55,26 @@ def _is_likely_bank_statement(text: str) -> bool:
    return count >= _STMT_AMOUNT_LINE_THRESHOLD


+# Image MIME types the vision LLM can process.  PDF/HTML/TXT use text-only path.
+_VISION_MIMETYPES = frozenset({
+    'image/jpeg', 'image/png', 'image/gif',
+    'image/bmp', 'image/tiff', 'image/webp',
+})
+
+
+def _get_vision_mode() -> str:
+    """Return the configured receipt_vision_mode ('vision' | 'text').
+
+    Wraps get_settings() so tests can patch this single symbol instead of
+    fighting the lru_cache on Settings.  Defaults to 'vision' on any error.
+    """
+    try:
+        from ..config import get_settings
+        return get_settings().receipt_vision_mode
+    except Exception:
+        return 'vision'
+
+
 _DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b')   # YYYY-MM-DD or YYYY/MM/DD
 _DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b')  # M/D/YYYY
 _DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b')  # M/D/YY
@@ -323,12 +343,16 @@ class ExpensesAgent(BaseAgent):
            logger.info('ocr filename=%r date_hint=%r ocr_len=%d text_preview=%r',
                        r.get('filename'), r.get('date_from_name'), ocr_len, ocr_preview)

-        # Parse all receipts concurrently
+        # Parse all receipts concurrently.
+        # b64 + mimetype are forwarded so _parse_receipt_text can use the
+        # vision LLM path when RECEIPT_VISION_MODE=vision (the default).
        parse_tasks = [
            self._parse_receipt_text(
                r.get('text', ''), r.get('filename', 'receipt'),
                expense_products=expense_products,
                date_hint=r.get('date_from_name'),
+                b64=r.get('b64'),
+                mimetype=r.get('mimetype'),
            )
            for r in unique_receipts
        ]
@@ -500,17 +524,26 @@ class ExpensesAgent(BaseAgent):

    async def _parse_receipt_text(self, text: str, filename: str,
                                   expense_products: list = None,
-                                   date_hint: str = None) -> dict:
+                                   date_hint: str = None,
+                                   b64: str = None,
+                                   mimetype: str = None) -> dict:
        """Parse a single receipt into structured fields.

        Strategy (most-reliable first):
-          amount      → regex on OCR text (deterministic)
-          date        → filename timestamp > OCR regex > today
-          vendor      → LLM (short excerpt, first ~600 chars)
-          product_name→ LLM (semantic match against expense product list)
+          amount       → regex on OCR text (deterministic, never ask LLM)
+          date         → filename timestamp > OCR regex > today
+          vendor       → vision LLM (image) > text LLM (OCR excerpt) > filename
+          product_name → same LLM call as vendor

-        The LLM is intentionally NOT asked for amount or date — the local
-        model hallucinates those fields when OCR text is ambiguous.
+        Vision mode (RECEIPT_VISION_MODE=vision, default):
+          When the upload is a JPEG/PNG/etc., the raw image is sent to the
+          vision-capable LLM so it can read logos and stylised fonts that
+          Tesseract OCR mangles.  If the vision call fails for any reason
+          (model error, timeout, bad JSON) the text path is used as fallback.
+
+        Text mode (RECEIPT_VISION_MODE=text):
+          Classic behaviour — only Tesseract OCR text is forwarded to the LLM.
+          Set in .env to instantly revert without rebuilding the container.
        """
        today = _date.today().isoformat()
        stripped = (text or '').strip()
@@ -541,15 +574,77 @@ class ExpensesAgent(BaseAgent):
        else:
            date = today

-        # ── Vendor + Category: LLM (two fields only) ─────────────────────────
+        # ── Vendor + Category: LLM ───────────────────────────────────────────
        vendor = filename
        product_name = ''
        product_list = ', '.join(f'"{p["name"]}"' for p in (expense_products or []))

+        if not product_list:
+            # No expense products configured — nothing to categorise
+            return {'vendor': vendor, 'amount': amount, 'date': date,
+                    'time': None, 'product_name': ''}
+
+        # Shared category guidance used in both prompt paths
+        _cat_guide = (
+            'Guide: restaurant / cafe / fast food / food court → food/meal product; '
+            'airline / airport / transit / taxi / parking / rental car → travel product; '
+            'gas station / petrol / fuel → fuel product; '
+            'hotel / motel / lodging → accommodation product; '
+            'hardware / home improvement / tech / office supply store → supplies product. '
+            'Return "" if nothing fits.'
+        )
+
+        # ── Path A: vision LLM ───────────────────────────────────────────────
+        # Use when: vision mode is enabled AND the file is a supported image type.
+        # The model sees the actual receipt image — no OCR garbling, reads logos
+        # and stylised fonts directly.  Falls through to Path B on any failure.
+        use_vision = (
+            _get_vision_mode() == 'vision'
+            and bool(b64)
+            and mimetype in _VISION_MIMETYPES
+        )
+
+        if use_vision:
+            vision_prompt = (
+                'Return ONLY valid JSON with exactly two keys:\n'
+                '"vendor": the business name printed at the top of this receipt '
+                '(first 1-3 lines; ignore slogans, product item names, '
+                'and payment-processor logos).\n'
+                f'"product_name": pick the single best match from [{product_list}]. '
+                f'{_cat_guide}\n'
+                'JSON only:'
+            )
+            try:
+                resp = await self._llm.submit(
+                    [{'role': 'user', 'content': vision_prompt, 'images': [b64]}],
+                    caller='expenses_agent_receipt_parser',
+                )
+                raw = (resp.content or '').strip()
+                first, last = raw.find('{'), raw.rfind('}')
+                if first != -1 and last > first:
+                    data = json.loads(raw[first:last + 1])
+                    v = str(data.get('vendor', '') or '').strip()
+                    if v:
+                        vendor = v
+                    product_name = str(data.get('product_name', '') or '').strip()
+                logger.debug('vision vendor=%r product=%r for %s', vendor, product_name, filename)
+                return {'vendor': vendor, 'amount': amount, 'date': date,
+                        'time': None, 'product_name': product_name}
+            except Exception as exc:
+                logger.warning(
+                    'Vision LLM failed for %s: %s — falling back to text path',
+                    filename, exc,
+                )
+                # Reset vendor so the text path starts fresh
+                vendor = filename
+                product_name = ''
+
+        # ── Path B: text-only (OCR excerpt) ─────────────────────────────────
+        # Used when: vision mode is off, mimetype is not an image (PDF/TXT/HTML),
+        # or the vision call failed.
        if not ocr_failed:
-            # Give LLM only the header of the receipt — vendor is in the first lines
            excerpt = stripped[:600]
-            prompt = (
+            text_prompt = (
                'Return ONLY valid JSON with exactly two keys:\n'
                '"vendor": the business name printed at the TOP of the receipt '
                '(usually the first 1-3 lines). '
@@ -565,28 +660,20 @@ class ExpensesAgent(BaseAgent):
                'multiple transactions rather than a single merchant receipt, '
                'use "". Use "" if no clear business name is visible.\n'
                f'"product_name": pick the single best match from [{product_list}]. '
-                'Guide: restaurant / cafe / fast food / food court → food/meal product; '
-                'airline / airport / transit / taxi / parking / rental car → travel product; '
-                'gas station / petrol / fuel → fuel product; '
-                'hotel / motel / lodging → accommodation product; '
-                'hardware / home improvement / tech / office supply store → supplies product. '
-                'Return "" if nothing fits.\n\n'
+                f'{_cat_guide}\n\n'
                f'Receipt text:\n{excerpt}\n\nJSON only:'
            )
-        elif product_list:
-            # OCR failed — guess category from filename only
-            prompt = (
+        else:
+            # OCR failed entirely — guess category from filename only
+            text_prompt = (
                f'A receipt file named "{filename}" could not be read. '
                f'Pick the most likely match from [{product_list}] based on the filename, '
                f'or "". Return ONLY: {{"vendor": "", "product_name": "..."}}'
            )
-        else:
-            return {'vendor': filename, 'amount': amount, 'date': date,
-                    'time': None, 'product_name': ''}

        try:
            resp = await self._llm.submit(
-                [{'role': 'user', 'content': prompt}],
+                [{'role': 'user', 'content': text_prompt}],
                caller='expenses_agent_receipt_parser',
            )
            raw = (resp.content or '').strip()
--- a/agent_service/config.py
+++ b/agent_service/config.py
@@ -50,6 +50,11 @@ class Settings(BaseSettings):
    postgres_min_connections: int = 2
    postgres_max_connections: int = 10

+    # Receipt OCR / vision
+    # 'vision' — use vision LLM for vendor+category when an image is uploaded (default)
+    # 'text'   — use Tesseract OCR text only (set RECEIPT_VISION_MODE=text to revert)
+    receipt_vision_mode: str = 'vision'
+
    # Rate limiting
    dispatch_rate_limit_per_user: int = 30  # requests per minute
    directive_timeout_minutes: int = 10