Add vision LLM path for receipt vendor/category identification

When RECEIPT_VISION_MODE=vision (default), uploaded receipt images are sent
directly to the vision-capable LLM (llama3.2-vision via Ollama) instead of the
OCR text excerpt. The model can read logos, stylised fonts, and layouts that
Tesseract OCR mangles (Home Depot, HMSHost/Sergio's, etc.).

Architecture:
- amount + date: always from Tesseract regex (deterministic, never LLM)
- vendor + category: vision LLM when image available, text LLM as fallback
- Fallthrough: if vision call fails for any reason, text path is tried next
- PDF/TXT/HTML receipts: always use text path (not visual media)

Revert instantly without a rebuild:
    echo "RECEIPT_VISION_MODE=text" >> /root/odoo/odoo-ai/.env
    docker compose up -d agent-service

config.py: add receipt_vision_mode setting (default 'vision')
expenses_agent.py: _VISION_MIMETYPES, _get_vision_mode() helper, dual-path
  _parse_receipt_text (b64/mimetype params), _act() passes b64
tests: 92 passing — 4 new vision tests, 2 existing prompt tests pinned to
  text mode via _get_vision_mode patch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 01:06:55 -04:00
parent db06fede5f
commit a736f3352b
3 changed files with 258 additions and 45 deletions
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -55,6 +55,26 @@ def _is_likely_bank_statement(text: str) -> bool:
    return count >= _STMT_AMOUNT_LINE_THRESHOLD
 # Image MIME types eligible for the vision LLM path.  A receipt whose mimetype
 # is not in this set (e.g. PDF/HTML/TXT) is handled by the text-only path.
 # NOTE(review): confirm the configured vision model really accepts every
 # format listed here (GIF/BMP/TIFF in particular) — not verifiable from this file.
 _VISION_MIMETYPES = frozenset({
    'image/jpeg', 'image/png', 'image/gif',
    'image/bmp', 'image/tiff', 'image/webp',
 })
 def _get_vision_mode() -> str:
    """Return the configured receipt_vision_mode ('vision' | 'text').
    Wraps get_settings() so tests can patch this single symbol instead of
    fighting the lru_cache on Settings.  Defaults to 'vision' on any error.
    """
    try:
        from ..config import get_settings
        return get_settings().receipt_vision_mode
    except Exception:
        return 'vision'
 # Compiled date patterns.  Each captures three numeric groups; '/' and '-' are
 # both accepted as separators, and \b anchors keep a match from starting or
 # ending in the middle of a longer run of digits/word characters.
 _DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b')   # YYYY-MM-DD or YYYY/MM/DD
 _DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b')  # M/D/YYYY (or M-D-YYYY)
 # The trailing \b stops this from matching the first two digits of a four-digit year.
 _DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b')  # M/D/YY
@@ -323,12 +343,16 @@ class ExpensesAgent(BaseAgent):
            logger.info('ocr filename=%r date_hint=%r ocr_len=%d text_preview=%r',
                        r.get('filename'), r.get('date_from_name'), ocr_len, ocr_preview)
-        # Parse all receipts concurrently
+        # Parse all receipts concurrently.
        # b64 + mimetype are forwarded so _parse_receipt_text can use the
        # vision LLM path when RECEIPT_VISION_MODE=vision (the default).
        parse_tasks = [
            self._parse_receipt_text(
                r.get('text', ''), r.get('filename', 'receipt'),
                expense_products=expense_products,
                date_hint=r.get('date_from_name'),
                b64=r.get('b64'),
                mimetype=r.get('mimetype'),
            )
            for r in unique_receipts
        ]
@@ -500,17 +524,26 @@ class ExpensesAgent(BaseAgent):
    async def _parse_receipt_text(self, text: str, filename: str,
                                   expense_products: list = None,
-                                   date_hint: str = None) -> dict:
+                                   date_hint: str = None,
                                   b64: str = None,
                                   mimetype: str = None) -> dict:
        """Parse a single receipt into structured fields.
        Strategy (most-reliable first):
-          amount      → regex on OCR text (deterministic)
+          amount       → regex on OCR text (deterministic, never ask LLM)
-          date        → filename timestamp > OCR regex > today
+          date         → filename timestamp > OCR regex > today
-          vendor      → LLM (short excerpt, first ~600 chars)
+          vendor       → vision LLM (image) > text LLM (OCR excerpt) > filename
-          product_name→ LLM (semantic match against expense product list)
+          product_name → same LLM call as vendor
-        The LLM is intentionally NOT asked for amount or date — the local
+        Vision mode (RECEIPT_VISION_MODE=vision, default):
-        model hallucinates those fields when OCR text is ambiguous.
+          When the upload is a JPEG/PNG/etc., the raw image is sent to the
          vision-capable LLM so it can read logos and stylised fonts that
          Tesseract OCR mangles.  If the vision call fails for any reason
          (model error, timeout, bad JSON) the text path is used as fallback.
        Text mode (RECEIPT_VISION_MODE=text):
          Classic behaviour — only Tesseract OCR text is forwarded to the LLM.
          Set in .env to instantly revert without rebuilding the container.
        """
        today = _date.today().isoformat()
        stripped = (text or '').strip()
@@ -541,15 +574,77 @@ class ExpensesAgent(BaseAgent):
        else:
            date = today
-        # ── Vendor + Category: LLM (two fields only) ─────────────────────────
+        # ── Vendor + Category: LLM ───────────────────────────────────────────
        vendor = filename
        product_name = ''
        product_list = ', '.join(f'"{p["name"]}"' for p in (expense_products or []))
        if not product_list:
            # No expense products configured — nothing to categorise
            return {'vendor': vendor, 'amount': amount, 'date': date,
                    'time': None, 'product_name': ''}
        # Shared category guidance used in both prompt paths
        _cat_guide = (
            'Guide: restaurant / cafe / fast food / food court → food/meal product; '
            'airline / airport / transit / taxi / parking / rental car → travel product; '
            'gas station / petrol / fuel → fuel product; '
            'hotel / motel / lodging → accommodation product; '
            'hardware / home improvement / tech / office supply store → supplies product. '
            'Return "" if nothing fits.'
        )
        # ── Path A: vision LLM ───────────────────────────────────────────────
        # Use when: vision mode is enabled AND the file is a supported image type.
        # The model sees the actual receipt image — no OCR garbling, reads logos
        # and stylised fonts directly.  Falls through to Path B on any failure.
        use_vision = (
            _get_vision_mode() == 'vision'
            and bool(b64)
            and mimetype in _VISION_MIMETYPES
        )
        if use_vision:
            vision_prompt = (
                'Return ONLY valid JSON with exactly two keys:\n'
                '"vendor": the business name printed at the top of this receipt '
                '(first 1-3 lines; ignore slogans, product item names, '
                'and payment-processor logos).\n'
                f'"product_name": pick the single best match from [{product_list}]. '
                f'{_cat_guide}\n'
                'JSON only:'
            )
            try:
                resp = await self._llm.submit(
                    [{'role': 'user', 'content': vision_prompt, 'images': [b64]}],
                    caller='expenses_agent_receipt_parser',
                )
                raw = (resp.content or '').strip()
                first, last = raw.find('{'), raw.rfind('}')
                if first != -1 and last > first:
                    data = json.loads(raw[first:last + 1])
                    v = str(data.get('vendor', '') or '').strip()
                    if v:
                        vendor = v
                    product_name = str(data.get('product_name', '') or '').strip()
                logger.debug('vision vendor=%r product=%r for %s', vendor, product_name, filename)
                return {'vendor': vendor, 'amount': amount, 'date': date,
                        'time': None, 'product_name': product_name}
            except Exception as exc:
                logger.warning(
                    'Vision LLM failed for %s: %s — falling back to text path',
                    filename, exc,
                )
                # Reset vendor so the text path starts fresh
                vendor = filename
                product_name = ''
        # ── Path B: text-only (OCR excerpt) ─────────────────────────────────
        # Used when: vision mode is off, mimetype is not an image (PDF/TXT/HTML),
        # or the vision call failed.
        if not ocr_failed:
            # Give LLM only the header of the receipt — vendor is in the first lines
            excerpt = stripped[:600]
-            prompt = (
+            text_prompt = (
                'Return ONLY valid JSON with exactly two keys:\n'
                '"vendor": the business name printed at the TOP of the receipt '
                '(usually the first 1-3 lines). '
@@ -565,28 +660,20 @@ class ExpensesAgent(BaseAgent):
                'multiple transactions rather than a single merchant receipt, '
                'use "". Use "" if no clear business name is visible.\n'
                f'"product_name": pick the single best match from [{product_list}]. '
-                'Guide: restaurant / cafe / fast food / food court → food/meal product; '
+                f'{_cat_guide}\n\n'
                'airline / airport / transit / taxi / parking / rental car → travel product; '
                'gas station / petrol / fuel → fuel product; '
                'hotel / motel / lodging → accommodation product; '
                'hardware / home improvement / tech / office supply store → supplies product. '
                'Return "" if nothing fits.\n\n'
                f'Receipt text:\n{excerpt}\n\nJSON only:'
            )
-        elif product_list:
+        else:
-            # OCR failed — guess category from filename only
+            # OCR failed entirely — guess category from filename only
-            prompt = (
+            text_prompt = (
                f'A receipt file named "{filename}" could not be read. '
                f'Pick the most likely match from [{product_list}] based on the filename, '
                f'or "". Return ONLY: {{"vendor": "", "product_name": "..."}}'
            )
        else:
            return {'vendor': filename, 'amount': amount, 'date': date,
                    'time': None, 'product_name': ''}
        try:
            resp = await self._llm.submit(
-                [{'role': 'user', 'content': prompt}],
+                [{'role': 'user', 'content': text_prompt}],
                caller='expenses_agent_receipt_parser',
            )
            raw = (resp.content or '').strip()
--- a/agent_service/config.py
+++ b/agent_service/config.py
@@ -50,6 +50,11 @@ class Settings(BaseSettings):
    postgres_min_connections: int = 2
    postgres_max_connections: int = 10
    # Receipt OCR / vision
    # 'vision' — use vision LLM for vendor+category when an image is uploaded (default)
    # 'text'   — use Tesseract OCR text only (set RECEIPT_VISION_MODE=text to revert)
    receipt_vision_mode: str = 'vision'
    # Rate limiting
    dispatch_rate_limit_per_user: int = 30  # requests per minute
    directive_timeout_minutes: int = 10
--- a/tests/test_expenses_agent.py
+++ b/tests/test_expenses_agent.py
@@ -428,7 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates():
 from agent_service.agents.expenses_agent import (
    _extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
-    _MONTH_MAP,
+    _MONTH_MAP, _get_vision_mode,
 )
@@ -671,10 +671,10 @@ async def test_parse_ocr_failed_skips_llm_amount():
@pytest.mark.asyncio
 async def test_vendor_prompt_does_not_contain_mcdonalds():
-    """The vendor LLM prompt must not reference 'McDonald' as a correction
+    """The text-path vendor prompt must not reference 'McDonald' — it biases
-    example — it biases the model toward returning McDonald's whenever OCR
+    the model toward returning McDonald's whenever OCR text is unclear.
-    text is unclear, causing unrelated receipts (Home Depot, HMSHost) to be
+    Pinned to text mode so vision path (which has its own cleaner prompt) does
-    misidentified as McDonald's.
+    not interfere.
    """
    agent = _make_agent()
    captured: list[str] = []
@@ -689,24 +689,22 @@ async def test_vendor_prompt_does_not_contain_mcdonalds():
    agent._llm.submit = _capture
-    await agent._parse_receipt_text(
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
-        'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
+        await agent._parse_receipt_text(
-        'homedepot.jpg',
+            'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
-        expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
+            'homedepot.jpg',
-    )
+            expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
        )
    full_prompt = ' '.join(captured)
    assert 'McDonald' not in full_prompt, (
-        "Vendor prompt must not contain 'McDonald' — it biases the model toward "
+        "Text-path prompt must not contain 'McDonald' — it biases the model."
        "returning McDonald's for any ambiguous receipt."
    )
@pytest.mark.asyncio
 async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
-    """Prompt must explicitly tell the LLM not to substitute a brand name that
+    """Text-path prompt must tell LLM not to substitute a brand not in the OCR text."""
    isn't in the OCR text — prevents "default to well-known fast food" behaviour.
    """
    agent = _make_agent()
    captured: list[str] = []
@@ -720,14 +718,14 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
    agent._llm.submit = _capture
-    await agent._parse_receipt_text(
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
-        '(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
+        await agent._parse_receipt_text(
-        'sergios.jpg',
+            '(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
-        expense_products=[{'id': 1, 'name': 'Meals'}],
+            'sergios.jpg',
-    )
+            expense_products=[{'id': 1, 'name': 'Meals'}],
        )
    full_prompt = ' '.join(captured)
    # The prompt should warn the model not to invent brand names
    assert 'only use a brand name' in full_prompt.lower() or \
           'do not' in full_prompt.lower() or \
           'not substitute' in full_prompt.lower(), (
@@ -735,6 +733,129 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
    )
 # ---------------------------------------------------------------------------
 # Vision LLM path — _parse_receipt_text with b64/mimetype
 # ---------------------------------------------------------------------------
@pytest.mark.asyncio
 async def test_vision_path_sends_image_to_llm():
    """In vision mode, the LLM call includes an 'images' key with the b64 data."""
    agent = _make_agent()
    captured_messages: list = []
    llm_resp = MagicMock()
    llm_resp.content = '{"vendor":"Home Depot","product_name":"Supplies"}'
    async def _capture(messages, caller=None):
        captured_messages.extend(messages)
        return llm_resp
    agent._llm.submit = _capture
    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='vision'):
        result = await agent._parse_receipt_text(
            'THE HOME DEPOT\nTotal: $36.78', 'homedepot.jpg',
            expense_products=[{'id': 1, 'name': 'Supplies'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )
    assert result['vendor'] == 'Home Depot'
    assert result['amount'] == 36.78
    assert len(captured_messages) == 1
    msg = captured_messages[0]
    assert 'images' in msg, "Vision path must include 'images' in LLM message"
    assert msg['images'] == ['FAKEBASE64DATA']
@pytest.mark.asyncio
 async def test_text_mode_skips_vision_even_with_image():
    """When RECEIPT_VISION_MODE=text, b64 is ignored and no images are sent."""
    agent = _make_agent()
    captured_messages: list = []
    llm_resp = MagicMock()
    llm_resp.content = '{"vendor":"Home Depot","product_name":"Supplies"}'
    async def _capture(messages, caller=None):
        captured_messages.extend(messages)
        return llm_resp
    agent._llm.submit = _capture
    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
        await agent._parse_receipt_text(
            'THE HOME DEPOT\nTotal: $36.78', 'homedepot.jpg',
            expense_products=[{'id': 1, 'name': 'Supplies'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )
    assert len(captured_messages) == 1
    assert 'images' not in captured_messages[0], (
        "Text mode must NOT send images to the LLM."
    )
@pytest.mark.asyncio
 async def test_vision_falls_back_to_text_on_llm_error():
    """If the vision LLM call raises, the text path is tried as fallback."""
    agent = _make_agent()
    call_count = [0]
    llm_resp = MagicMock()
    llm_resp.content = '{"vendor":"Shell","product_name":"Fuel"}'
    async def _first_fails(messages, caller=None):
        call_count[0] += 1
        if call_count[0] == 1:
            raise RuntimeError('simulated vision model error')
        return llm_resp
    agent._llm.submit = _first_fails
    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='vision'):
        result = await agent._parse_receipt_text(
            'SHELL GAS STATION\nTotal Sale $55.00', 'shell.jpg',
            expense_products=[{'id': 1, 'name': 'Fuel'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )
    assert call_count[0] == 2, "Must make exactly 2 LLM calls (vision failed, text succeeded)"
    assert result['vendor'] == 'Shell'
    assert result['amount'] == 55.00
@pytest.mark.asyncio
 async def test_non_image_mimetype_uses_text_path_in_vision_mode():
    """PDFs and text files must always use the text path even in vision mode."""
    agent = _make_agent()
    captured_messages: list = []
    llm_resp = MagicMock()
    llm_resp.content = '{"vendor":"United Airlines","product_name":"Travel"}'
    async def _capture(messages, caller=None):
        captured_messages.extend(messages)
        return llm_resp
    agent._llm.submit = _capture
    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='vision'):
        await agent._parse_receipt_text(
            'United Airlines\nBaggage Fee\nTotal: $45.00', 'ticket.pdf',
            expense_products=[{'id': 1, 'name': 'Travel'}],
            b64='FAKEBASE64DATA',
            mimetype='application/pdf',   # NOT an image — no vision
        )
    assert len(captured_messages) == 1
    assert 'images' not in captured_messages[0], (
        "PDF receipts must not be sent as images even in vision mode."
    )
 # ---------------------------------------------------------------------------
 # parse_upload — receipt_parser.py
 # ---------------------------------------------------------------------------