Add vision LLM path for receipt vendor/category identification

When RECEIPT_VISION_MODE=vision (default), uploaded receipt images are sent directly to the vision-capable LLM (llama3.2-vision via Ollama) instead of the OCR text excerpt. The model can read logos, stylised fonts, and layouts that Tesseract OCR mangles (Home Depot, HMSHost/Sergio's, etc.).

Architecture:
- amount + date: always from Tesseract regex (deterministic, never LLM)
- vendor + category: vision LLM when image available, text LLM as fallback
- Fallthrough: if vision call fails for any reason, text path is tried next
- PDF/TXT/HTML receipts: always use text path (not visual media)

Revert instantly without a rebuild:
    echo "RECEIPT_VISION_MODE=text" >> /root/odoo/odoo-ai/.env
    docker compose up -d agent-service

config.py: add receipt_vision_mode setting (default 'vision')
expenses_agent.py: _VISION_MIMETYPES, _get_vision_mode() helper, dual-path _parse_receipt_text (b64/mimetype params), _act() passes b64
tests: 92 passing — 4 new vision tests, 2 existing prompt tests pinned to text mode via _get_vision_mode patch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 01:06:55 -04:00
parent db06fede5f
commit a736f3352b
3 changed files with 258 additions and 45 deletions
--- a/tests/test_expenses_agent.py
+++ b/tests/test_expenses_agent.py
@@ -428,7 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates():

 from agent_service.agents.expenses_agent import (
    _extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
-    _MONTH_MAP,
+    _MONTH_MAP, _get_vision_mode,
 )


@@ -671,10 +671,10 @@ async def test_parse_ocr_failed_skips_llm_amount():

@pytest.mark.asyncio
 async def test_vendor_prompt_does_not_contain_mcdonalds():
-    """The vendor LLM prompt must not reference 'McDonald' as a correction
-    example — it biases the model toward returning McDonald's whenever OCR
-    text is unclear, causing unrelated receipts (Home Depot, HMSHost) to be
-    misidentified as McDonald's.
+    """The text-path vendor prompt must not reference 'McDonald' — it biases
+    the model toward returning McDonald's whenever OCR text is unclear.
+    Pinned to text mode so vision path (which has its own cleaner prompt) does
+    not interfere.
    """
    agent = _make_agent()
    captured: list[str] = []
@@ -689,24 +689,22 @@ async def test_vendor_prompt_does_not_contain_mcdonalds():

    agent._llm.submit = _capture

-    await agent._parse_receipt_text(
-        'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
-        'homedepot.jpg',
-        expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
-    )
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
+        await agent._parse_receipt_text(
+            'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
+            'homedepot.jpg',
+            expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
+        )

    full_prompt = ' '.join(captured)
    assert 'McDonald' not in full_prompt, (
-        "Vendor prompt must not contain 'McDonald' — it biases the model toward "
-        "returning McDonald's for any ambiguous receipt."
+        "Text-path prompt must not contain 'McDonald' — it biases the model."
    )


@pytest.mark.asyncio
 async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
-    """Prompt must explicitly tell the LLM not to substitute a brand name that
-    isn't in the OCR text — prevents "default to well-known fast food" behaviour.
-    """
+    """Text-path prompt must tell LLM not to substitute a brand not in the OCR text."""
    agent = _make_agent()
    captured: list[str] = []

@@ -720,14 +718,14 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():

    agent._llm.submit = _capture

-    await agent._parse_receipt_text(
-        '(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
-        'sergios.jpg',
-        expense_products=[{'id': 1, 'name': 'Meals'}],
-    )
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
+        await agent._parse_receipt_text(
+            '(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
+            'sergios.jpg',
+            expense_products=[{'id': 1, 'name': 'Meals'}],
+        )

    full_prompt = ' '.join(captured)
-    # The prompt should warn the model not to invent brand names
    assert 'only use a brand name' in full_prompt.lower() or \
           'do not' in full_prompt.lower() or \
           'not substitute' in full_prompt.lower(), (
@@ -735,6 +733,129 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
    )


+# ---------------------------------------------------------------------------
+# Vision LLM path — _parse_receipt_text with b64/mimetype
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_vision_path_sends_image_to_llm():
+    """In vision mode, the LLM call includes an 'images' key with the b64 data."""
+    agent = _make_agent()
+    captured_messages: list = []
+
+    llm_resp = MagicMock()
+    llm_resp.content = '{"vendor":"Home Depot","product_name":"Supplies"}'
+
+    async def _capture(messages, caller=None):
+        captured_messages.extend(messages)
+        return llm_resp
+
+    agent._llm.submit = _capture
+
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='vision'):
+        result = await agent._parse_receipt_text(
+            'THE HOME DEPOT\nTotal: $36.78', 'homedepot.jpg',
+            expense_products=[{'id': 1, 'name': 'Supplies'}],
+            b64='FAKEBASE64DATA',
+            mimetype='image/jpeg',
+        )
+
+    assert result['vendor'] == 'Home Depot'
+    assert result['amount'] == 36.78
+    assert len(captured_messages) == 1
+    msg = captured_messages[0]
+    assert 'images' in msg, "Vision path must include 'images' in LLM message"
+    assert msg['images'] == ['FAKEBASE64DATA']
+
+
+@pytest.mark.asyncio
+async def test_text_mode_skips_vision_even_with_image():
+    """When RECEIPT_VISION_MODE=text, b64 is ignored and no images are sent."""
+    agent = _make_agent()
+    captured_messages: list = []
+
+    llm_resp = MagicMock()
+    llm_resp.content = '{"vendor":"Home Depot","product_name":"Supplies"}'
+
+    async def _capture(messages, caller=None):
+        captured_messages.extend(messages)
+        return llm_resp
+
+    agent._llm.submit = _capture
+
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
+        await agent._parse_receipt_text(
+            'THE HOME DEPOT\nTotal: $36.78', 'homedepot.jpg',
+            expense_products=[{'id': 1, 'name': 'Supplies'}],
+            b64='FAKEBASE64DATA',
+            mimetype='image/jpeg',
+        )
+
+    assert len(captured_messages) == 1
+    assert 'images' not in captured_messages[0], (
+        "Text mode must NOT send images to the LLM."
+    )
+
+
+@pytest.mark.asyncio
+async def test_vision_falls_back_to_text_on_llm_error():
+    """If the vision LLM call raises, the text path is tried as fallback."""
+    agent = _make_agent()
+    call_count = [0]
+
+    llm_resp = MagicMock()
+    llm_resp.content = '{"vendor":"Shell","product_name":"Fuel"}'
+
+    async def _first_fails(messages, caller=None):
+        call_count[0] += 1
+        if call_count[0] == 1:
+            raise RuntimeError('simulated vision model error')
+        return llm_resp
+
+    agent._llm.submit = _first_fails
+
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='vision'):
+        result = await agent._parse_receipt_text(
+            'SHELL GAS STATION\nTotal Sale $55.00', 'shell.jpg',
+            expense_products=[{'id': 1, 'name': 'Fuel'}],
+            b64='FAKEBASE64DATA',
+            mimetype='image/jpeg',
+        )
+
+    assert call_count[0] == 2, "Must make exactly 2 LLM calls (vision failed, text succeeded)"
+    assert result['vendor'] == 'Shell'
+    assert result['amount'] == 55.00
+
+
+@pytest.mark.asyncio
+async def test_non_image_mimetype_uses_text_path_in_vision_mode():
+    """PDFs and text files must always use the text path even in vision mode."""
+    agent = _make_agent()
+    captured_messages: list = []
+
+    llm_resp = MagicMock()
+    llm_resp.content = '{"vendor":"United Airlines","product_name":"Travel"}'
+
+    async def _capture(messages, caller=None):
+        captured_messages.extend(messages)
+        return llm_resp
+
+    agent._llm.submit = _capture
+
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='vision'):
+        await agent._parse_receipt_text(
+            'United Airlines\nBaggage Fee\nTotal: $45.00', 'ticket.pdf',
+            expense_products=[{'id': 1, 'name': 'Travel'}],
+            b64='FAKEBASE64DATA',
+            mimetype='application/pdf',   # NOT an image — no vision
+        )
+
+    assert len(captured_messages) == 1
+    assert 'images' not in captured_messages[0], (
+        "PDF receipts must not be sent as images even in vision mode."
+    )
+
+
 # ---------------------------------------------------------------------------
 # parse_upload — receipt_parser.py
 # ---------------------------------------------------------------------------