Add vision LLM path for receipt vendor/category identification

When RECEIPT_VISION_MODE=vision (default), uploaded receipt images are sent directly to the vision-capable LLM (llama3.2-vision via Ollama) instead of the OCR text excerpt. The model can read logos, stylised fonts, and layouts that Tesseract OCR mangles (Home Depot, HMSHost/Sergio's, etc.).

Architecture:
- amount + date: always from Tesseract regex (deterministic, never LLM)
- vendor + category: vision LLM when image available, text LLM as fallback
- Fallthrough: if vision call fails for any reason, text path is tried next
- PDF/TXT/HTML receipts: always use text path (not visual media)

Revert instantly without a rebuild:
  echo "RECEIPT_VISION_MODE=text" >> /root/odoo/odoo-ai/.env
  docker compose up -d agent-service

config.py: add receipt_vision_mode setting (default 'vision')
expenses_agent.py: _VISION_MIMETYPES, _get_vision_mode() helper, dual-path _parse_receipt_text (b64/mimetype params), _act() passes b64
tests: 92 passing — 4 new vision tests, 2 existing prompt tests pinned to text mode via _get_vision_mode patch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 01:06:55 -04:00
parent db06fede5f
commit a736f3352b
3 changed files with 258 additions and 45 deletions
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -55,6 +55,26 @@ def _is_likely_bank_statement(text: str) -> bool:
    return count >= _STMT_AMOUNT_LINE_THRESHOLD


+# Image MIME types eligible for the vision LLM path.  _parse_receipt_text
+# only sends the image when the upload's mimetype is a member of this set;
+# anything else (PDF/HTML/TXT, or a missing mimetype) uses the text-only path.
+# Membership is an exact string match, so callers must pass the bare
+# lower-case type exactly as spelled here (e.g. 'image/jpeg', not 'image/jpg'
+# and not a value carrying parameters).
+# NOTE(review): confirm the deployed vision model accepts every format listed
+# here (notably GIF/BMP/TIFF) — not verifiable from this file.
+_VISION_MIMETYPES = frozenset({
+    'image/jpeg', 'image/png', 'image/gif',
+    'image/bmp', 'image/tiff', 'image/webp',
+})
+
+
+def _get_vision_mode() -> str:
+    """Return the configured receipt vision mode, normalised for comparison.
+
+    Wraps get_settings() so tests can patch this single symbol instead of
+    fighting the lru_cache on Settings.
+
+    Returns:
+        The ``receipt_vision_mode`` setting with surrounding whitespace
+        removed and lower-cased, so hand-edited values such as ``'Vision'``
+        or ``'text '`` compare equal to the literals callers test against.
+        Non-string values are returned unchanged.  ``'vision'`` is returned
+        when the settings cannot be loaded at all.
+    """
+    try:
+        from ..config import get_settings
+        mode = get_settings().receipt_vision_mode
+    except Exception as exc:
+        # Best-effort: a broken config must not block receipt parsing, but
+        # leave a trace so the fallback is not completely silent.
+        logger.debug(
+            'receipt_vision_mode unavailable (%s); defaulting to vision', exc)
+        return 'vision'
+    # Callers compare against the exact literal 'vision'; without this a
+    # differently-cased or padded value would silently select the text path.
+    return mode.strip().lower() if isinstance(mode, str) else mode
+
 _DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b')   # YYYY-MM-DD or YYYY/MM/DD
 _DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b')  # M/D/YYYY
 _DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b')  # M/D/YY
@@ -323,12 +343,16 @@ class ExpensesAgent(BaseAgent):
            logger.info('ocr filename=%r date_hint=%r ocr_len=%d text_preview=%r',
                        r.get('filename'), r.get('date_from_name'), ocr_len, ocr_preview)

-        # Parse all receipts concurrently
+        # Parse all receipts concurrently.
+        # b64 + mimetype are forwarded so _parse_receipt_text can use the
+        # vision LLM path when RECEIPT_VISION_MODE=vision (the default).
        parse_tasks = [
            self._parse_receipt_text(
                r.get('text', ''), r.get('filename', 'receipt'),
                expense_products=expense_products,
                date_hint=r.get('date_from_name'),
+                b64=r.get('b64'),
+                mimetype=r.get('mimetype'),
            )
            for r in unique_receipts
        ]
@@ -500,17 +524,26 @@ class ExpensesAgent(BaseAgent):

    async def _parse_receipt_text(self, text: str, filename: str,
                                   expense_products: list = None,
-                                   date_hint: str = None) -> dict:
+                                   date_hint: str = None,
+                                   b64: str = None,
+                                   mimetype: str = None) -> dict:
        """Parse a single receipt into structured fields.

        Strategy (most-reliable first):
-          amount      → regex on OCR text (deterministic)
-          date        → filename timestamp > OCR regex > today
-          vendor      → LLM (short excerpt, first ~600 chars)
-          product_name→ LLM (semantic match against expense product list)
+          amount       → regex on OCR text (deterministic, never ask LLM)
+          date         → filename timestamp > OCR regex > today
+          vendor       → vision LLM (image) > text LLM (OCR excerpt) > filename
+          product_name → same LLM call as vendor

-        The LLM is intentionally NOT asked for amount or date — the local
-        model hallucinates those fields when OCR text is ambiguous.
+        Vision mode (RECEIPT_VISION_MODE=vision, default):
+          When the upload is a JPEG/PNG/etc., the raw image is sent to the
+          vision-capable LLM so it can read logos and stylised fonts that
+          Tesseract OCR mangles.  If the vision call fails for any reason
+          (model error, timeout, bad JSON) the text path is used as fallback.
+
+        Text mode (RECEIPT_VISION_MODE=text):
+          Classic behaviour — only Tesseract OCR text is forwarded to the LLM.
+          Set in .env to instantly revert without rebuilding the container.
        """
        today = _date.today().isoformat()
        stripped = (text or '').strip()
@@ -541,15 +574,77 @@ class ExpensesAgent(BaseAgent):
        else:
            date = today

-        # ── Vendor + Category: LLM (two fields only) ─────────────────────────
+        # ── Vendor + Category: LLM ───────────────────────────────────────────
        vendor = filename
        product_name = ''
        product_list = ', '.join(f'"{p["name"]}"' for p in (expense_products or []))

+        if not product_list:
+            # No expense products configured — nothing to categorise
+            return {'vendor': vendor, 'amount': amount, 'date': date,
+                    'time': None, 'product_name': ''}
+
+        # Shared category guidance used in both prompt paths
+        _cat_guide = (
+            'Guide: restaurant / cafe / fast food / food court → food/meal product; '
+            'airline / airport / transit / taxi / parking / rental car → travel product; '
+            'gas station / petrol / fuel → fuel product; '
+            'hotel / motel / lodging → accommodation product; '
+            'hardware / home improvement / tech / office supply store → supplies product. '
+            'Return "" if nothing fits.'
+        )
+
+        # ── Path A: vision LLM ───────────────────────────────────────────────
+        # Use when: vision mode is enabled AND the file is a supported image type.
+        # The model sees the actual receipt image — no OCR garbling, reads logos
+        # and stylised fonts directly.  Falls through to Path B on any failure.
+        use_vision = (
+            _get_vision_mode() == 'vision'
+            and bool(b64)
+            and mimetype in _VISION_MIMETYPES
+        )
+
+        if use_vision:
+            vision_prompt = (
+                'Return ONLY valid JSON with exactly two keys:\n'
+                '"vendor": the business name printed at the top of this receipt '
+                '(first 1-3 lines; ignore slogans, product item names, '
+                'and payment-processor logos).\n'
+                f'"product_name": pick the single best match from [{product_list}]. '
+                f'{_cat_guide}\n'
+                'JSON only:'
+            )
+            try:
+                resp = await self._llm.submit(
+                    [{'role': 'user', 'content': vision_prompt, 'images': [b64]}],
+                    caller='expenses_agent_receipt_parser',
+                )
+                raw = (resp.content or '').strip()
+                first, last = raw.find('{'), raw.rfind('}')
+                if first != -1 and last > first:
+                    data = json.loads(raw[first:last + 1])
+                    v = str(data.get('vendor', '') or '').strip()
+                    if v:
+                        vendor = v
+                    product_name = str(data.get('product_name', '') or '').strip()
+                logger.debug('vision vendor=%r product=%r for %s', vendor, product_name, filename)
+                return {'vendor': vendor, 'amount': amount, 'date': date,
+                        'time': None, 'product_name': product_name}
+            except Exception as exc:
+                logger.warning(
+                    'Vision LLM failed for %s: %s — falling back to text path',
+                    filename, exc,
+                )
+                # Reset vendor so the text path starts fresh
+                vendor = filename
+                product_name = ''
+
+        # ── Path B: text-only (OCR excerpt) ─────────────────────────────────
+        # Used when: vision mode is off, mimetype is not an image (PDF/TXT/HTML),
+        # or the vision call failed.
        if not ocr_failed:
-            # Give LLM only the header of the receipt — vendor is in the first lines
            excerpt = stripped[:600]
-            prompt = (
+            text_prompt = (
                'Return ONLY valid JSON with exactly two keys:\n'
                '"vendor": the business name printed at the TOP of the receipt '
                '(usually the first 1-3 lines). '
@@ -565,28 +660,20 @@ class ExpensesAgent(BaseAgent):
                'multiple transactions rather than a single merchant receipt, '
                'use "". Use "" if no clear business name is visible.\n'
                f'"product_name": pick the single best match from [{product_list}]. '
-                'Guide: restaurant / cafe / fast food / food court → food/meal product; '
-                'airline / airport / transit / taxi / parking / rental car → travel product; '
-                'gas station / petrol / fuel → fuel product; '
-                'hotel / motel / lodging → accommodation product; '
-                'hardware / home improvement / tech / office supply store → supplies product. '
-                'Return "" if nothing fits.\n\n'
+                f'{_cat_guide}\n\n'
                f'Receipt text:\n{excerpt}\n\nJSON only:'
            )
-        elif product_list:
-            # OCR failed — guess category from filename only
-            prompt = (
+        else:
+            # OCR failed entirely — guess category from filename only
+            text_prompt = (
                f'A receipt file named "{filename}" could not be read. '
                f'Pick the most likely match from [{product_list}] based on the filename, '
                f'or "". Return ONLY: {{"vendor": "", "product_name": "..."}}'
            )
-        else:
-            return {'vendor': filename, 'amount': amount, 'date': date,
-                    'time': None, 'product_name': ''}

        try:
            resp = await self._llm.submit(
-                [{'role': 'user', 'content': prompt}],
+                [{'role': 'user', 'content': text_prompt}],
                caller='expenses_agent_receipt_parser',
            )
            raw = (resp.content or '').strip()
--- a/agent_service/config.py
+++ b/agent_service/config.py
@@ -50,6 +50,11 @@ class Settings(BaseSettings):
    postgres_min_connections: int = 2
    postgres_max_connections: int = 10

+    # Receipt OCR / vision
+    # 'vision' — use vision LLM for vendor+category when an image is uploaded (default)
+    # 'text'   — use Tesseract OCR text only (set RECEIPT_VISION_MODE=text to revert)
+    receipt_vision_mode: str = 'vision'
+
    # Rate limiting
    dispatch_rate_limit_per_user: int = 30  # requests per minute
    directive_timeout_minutes: int = 10
--- a/tests/test_expenses_agent.py
+++ b/tests/test_expenses_agent.py
@@ -428,7 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates():

 from agent_service.agents.expenses_agent import (
    _extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
-    _MONTH_MAP,
+    _MONTH_MAP, _get_vision_mode,
 )


@@ -671,10 +671,10 @@ async def test_parse_ocr_failed_skips_llm_amount():

@pytest.mark.asyncio
 async def test_vendor_prompt_does_not_contain_mcdonalds():
-    """The vendor LLM prompt must not reference 'McDonald' as a correction
-    example — it biases the model toward returning McDonald's whenever OCR
-    text is unclear, causing unrelated receipts (Home Depot, HMSHost) to be
-    misidentified as McDonald's.
+    """The text-path vendor prompt must not reference 'McDonald' — it biases
+    the model toward returning McDonald's whenever OCR text is unclear.
+    Pinned to text mode so vision path (which has its own cleaner prompt) does
+    not interfere.
    """
    agent = _make_agent()
    captured: list[str] = []
@@ -689,24 +689,22 @@ async def test_vendor_prompt_does_not_contain_mcdonalds():

    agent._llm.submit = _capture

-    await agent._parse_receipt_text(
-        'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
-        'homedepot.jpg',
-        expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
-    )
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
+        await agent._parse_receipt_text(
+            'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
+            'homedepot.jpg',
+            expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
+        )

    full_prompt = ' '.join(captured)
    assert 'McDonald' not in full_prompt, (
-        "Vendor prompt must not contain 'McDonald' — it biases the model toward "
-        "returning McDonald's for any ambiguous receipt."
+        "Text-path prompt must not contain 'McDonald' — it biases the model."
    )


@pytest.mark.asyncio
 async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
-    """Prompt must explicitly tell the LLM not to substitute a brand name that
-    isn't in the OCR text — prevents "default to well-known fast food" behaviour.
-    """
+    """Text-path prompt must tell LLM not to substitute a brand not in the OCR text."""
    agent = _make_agent()
    captured: list[str] = []

@@ -720,14 +718,14 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():

    agent._llm.submit = _capture

-    await agent._parse_receipt_text(
-        '(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
-        'sergios.jpg',
-        expense_products=[{'id': 1, 'name': 'Meals'}],
-    )
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
+        await agent._parse_receipt_text(
+            '(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
+            'sergios.jpg',
+            expense_products=[{'id': 1, 'name': 'Meals'}],
+        )

    full_prompt = ' '.join(captured)
-    # The prompt should warn the model not to invent brand names
    assert 'only use a brand name' in full_prompt.lower() or \
           'do not' in full_prompt.lower() or \
           'not substitute' in full_prompt.lower(), (
@@ -735,6 +733,129 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
    )


+# ---------------------------------------------------------------------------
+# Vision LLM path — _parse_receipt_text with b64/mimetype
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_vision_path_sends_image_to_llm():
+    """Vision mode forwards the base64 image to the LLM via the 'images' key."""
+    outbound: list = []
+    canned = MagicMock()
+    canned.content = '{"vendor":"Home Depot","product_name":"Supplies"}'
+
+    async def _record(messages, caller=None):
+        # Keep every message handed to the LLM so the payload can be inspected.
+        outbound.extend(messages)
+        return canned
+
+    agent = _make_agent()
+    agent._llm.submit = _record
+
+    force_vision = patch(
+        'agent_service.agents.expenses_agent._get_vision_mode',
+        return_value='vision',
+    )
+    with force_vision:
+        parsed = await agent._parse_receipt_text(
+            'THE HOME DEPOT\nTotal: $36.78',
+            'homedepot.jpg',
+            expense_products=[{'id': 1, 'name': 'Supplies'}],
+            b64='FAKEBASE64DATA',
+            mimetype='image/jpeg',
+        )
+
+    # Vendor comes from the (stubbed) LLM reply, amount from the OCR text.
+    assert parsed['vendor'] == 'Home Depot'
+    assert parsed['amount'] == 36.78
+
+    # Exactly one LLM message, and it must carry the raw image payload.
+    assert len(outbound) == 1
+    sent = outbound[0]
+    assert 'images' in sent, "Vision path must include 'images' in LLM message"
+    assert sent['images'] == ['FAKEBASE64DATA']
+
+
+@pytest.mark.asyncio
+async def test_text_mode_skips_vision_even_with_image():
+    """With the mode pinned to 'text', a supplied image is never sent to the LLM."""
+    outbound: list = []
+    canned = MagicMock()
+    canned.content = '{"vendor":"Home Depot","product_name":"Supplies"}'
+
+    async def _record(messages, caller=None):
+        outbound.extend(messages)
+        return canned
+
+    agent = _make_agent()
+    agent._llm.submit = _record
+
+    force_text = patch(
+        'agent_service.agents.expenses_agent._get_vision_mode',
+        return_value='text',
+    )
+    with force_text:
+        # b64 + an image mimetype are supplied on purpose: text mode must
+        # ignore them.
+        await agent._parse_receipt_text(
+            'THE HOME DEPOT\nTotal: $36.78',
+            'homedepot.jpg',
+            expense_products=[{'id': 1, 'name': 'Supplies'}],
+            b64='FAKEBASE64DATA',
+            mimetype='image/jpeg',
+        )
+
+    call_total = len(outbound)
+    assert call_total == 1
+    assert 'images' not in outbound[0], (
+        "Text mode must NOT send images to the LLM."
+    )
+
+
+@pytest.mark.asyncio
+async def test_vision_falls_back_to_text_on_llm_error():
+    """If the vision LLM call raises, the text path is tried as fallback.
+
+    Counting calls alone would also pass if both attempts used the same
+    prompt, so the messages of each call are recorded and the test checks
+    that the failing call carried the image and the retry did not.
+    """
+    agent = _make_agent()
+    calls: list = []
+
+    llm_resp = MagicMock()
+    llm_resp.content = '{"vendor":"Shell","product_name":"Fuel"}'
+
+    async def _first_fails(messages, caller=None):
+        # Snapshot the messages per call; only the first call errors out.
+        calls.append(list(messages))
+        if len(calls) == 1:
+            raise RuntimeError('simulated vision model error')
+        return llm_resp
+
+    agent._llm.submit = _first_fails
+
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='vision'):
+        result = await agent._parse_receipt_text(
+            'SHELL GAS STATION\nTotal Sale $55.00', 'shell.jpg',
+            expense_products=[{'id': 1, 'name': 'Fuel'}],
+            b64='FAKEBASE64DATA',
+            mimetype='image/jpeg',
+        )
+
+    assert len(calls) == 2, "Must make exactly 2 LLM calls (vision failed, text succeeded)"
+    assert 'images' in calls[0][0], "First (failing) call must be the vision call"
+    assert 'images' not in calls[1][0], "Fallback call must use the text-only prompt"
+    assert result['vendor'] == 'Shell'
+    assert result['amount'] == 55.00
+
+
+@pytest.mark.asyncio
+async def test_non_image_mimetype_uses_text_path_in_vision_mode():
+    """A non-image upload (a PDF here) stays on the text path in vision mode."""
+    outbound: list = []
+    canned = MagicMock()
+    canned.content = '{"vendor":"United Airlines","product_name":"Travel"}'
+
+    async def _record(messages, caller=None):
+        outbound.extend(messages)
+        return canned
+
+    agent = _make_agent()
+    agent._llm.submit = _record
+
+    force_vision = patch(
+        'agent_service.agents.expenses_agent._get_vision_mode',
+        return_value='vision',
+    )
+    with force_vision:
+        await agent._parse_receipt_text(
+            'United Airlines\nBaggage Fee\nTotal: $45.00',
+            'ticket.pdf',
+            expense_products=[{'id': 1, 'name': 'Travel'}],
+            b64='FAKEBASE64DATA',
+            mimetype='application/pdf',   # deliberately not an image type
+        )
+
+    call_total = len(outbound)
+    assert call_total == 1
+    assert 'images' not in outbound[0], (
+        "PDF receipts must not be sent as images even in vision mode."
+    )
+
+
 # ---------------------------------------------------------------------------
 # parse_upload — receipt_parser.py
 # ---------------------------------------------------------------------------