Add vision LLM path for receipt vendor/category identification
When RECEIPT_VISION_MODE=vision (the default), uploaded receipt images are sent directly to the vision-capable LLM (llama3.2-vision via Ollama) instead of the OCR text excerpt. The model can read logos, stylised fonts, and layouts that Tesseract OCR mangles (Home Depot, HMSHost/Sergio's, etc.).

Architecture:
- amount + date: always from Tesseract regex (deterministic, never LLM)
- vendor + category: vision LLM when an image is available, text LLM as fallback
- Fallthrough: if the vision call fails for any reason, the text path is tried next
- PDF/TXT/HTML receipts: always use the text path (not visual media)

Revert instantly without a rebuild:
    echo "RECEIPT_VISION_MODE=text" >> /root/odoo/odoo-ai/.env
    docker compose up -d agent-service

Changes:
- config.py: add receipt_vision_mode setting (default 'vision')
- expenses_agent.py: _VISION_MIMETYPES, _get_vision_mode() helper, dual-path _parse_receipt_text (b64/mimetype params), _act() passes b64
- tests: 92 passing — 4 new vision tests, 2 existing prompt tests pinned to text mode via _get_vision_mode patch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -55,6 +55,26 @@ def _is_likely_bank_statement(text: str) -> bool:
|
|||||||
return count >= _STMT_AMOUNT_LINE_THRESHOLD
|
return count >= _STMT_AMOUNT_LINE_THRESHOLD
|
||||||
|
|
||||||
|
|
||||||
|
# MIME types of uploads that are eligible for the vision LLM path.
# Non-image uploads (PDF/HTML/TXT) are not listed and use the text-only path.
_VISION_MIMETYPES = frozenset((
    'image/bmp',
    'image/gif',
    'image/jpeg',
    'image/png',
    'image/tiff',
    'image/webp',
))
|
||||||
|
|
||||||
|
|
||||||
|
def _get_vision_mode() -> str:
    """Return the configured receipt_vision_mode, normalised for comparison.

    Reads ``receipt_vision_mode`` from ``get_settings()``. The value is
    stripped and lower-cased so that a setting such as ``Vision`` or
    ``"text "`` still compares equal to ``'vision'`` / ``'text'``; callers
    compare the result against the lower-case literal.

    Wraps get_settings() so tests can patch this single symbol instead of
    fighting the lru_cache on Settings.

    Returns:
        The normalised mode string. Defaults to ``'vision'`` on any error
        while importing or reading the settings.
    """
    try:
        from ..config import get_settings
        configured = get_settings().receipt_vision_mode
        # Normalise inside the try so a malformed value also falls back.
        return str(configured).strip().lower()
    except Exception:
        # Deliberate best-effort: a settings problem must not block parsing.
        return 'vision'
|
||||||
|
|
||||||
|
|
||||||
_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD
|
_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD
|
||||||
_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY
|
_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY
|
||||||
_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY
|
_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY
|
||||||
@@ -323,12 +343,16 @@ class ExpensesAgent(BaseAgent):
|
|||||||
logger.info('ocr filename=%r date_hint=%r ocr_len=%d text_preview=%r',
|
logger.info('ocr filename=%r date_hint=%r ocr_len=%d text_preview=%r',
|
||||||
r.get('filename'), r.get('date_from_name'), ocr_len, ocr_preview)
|
r.get('filename'), r.get('date_from_name'), ocr_len, ocr_preview)
|
||||||
|
|
||||||
# Parse all receipts concurrently
|
# Parse all receipts concurrently.
|
||||||
|
# b64 + mimetype are forwarded so _parse_receipt_text can use the
|
||||||
|
# vision LLM path when RECEIPT_VISION_MODE=vision (the default).
|
||||||
parse_tasks = [
|
parse_tasks = [
|
||||||
self._parse_receipt_text(
|
self._parse_receipt_text(
|
||||||
r.get('text', ''), r.get('filename', 'receipt'),
|
r.get('text', ''), r.get('filename', 'receipt'),
|
||||||
expense_products=expense_products,
|
expense_products=expense_products,
|
||||||
date_hint=r.get('date_from_name'),
|
date_hint=r.get('date_from_name'),
|
||||||
|
b64=r.get('b64'),
|
||||||
|
mimetype=r.get('mimetype'),
|
||||||
)
|
)
|
||||||
for r in unique_receipts
|
for r in unique_receipts
|
||||||
]
|
]
|
||||||
@@ -500,17 +524,26 @@ class ExpensesAgent(BaseAgent):
|
|||||||
|
|
||||||
async def _parse_receipt_text(self, text: str, filename: str,
|
async def _parse_receipt_text(self, text: str, filename: str,
|
||||||
expense_products: list = None,
|
expense_products: list = None,
|
||||||
date_hint: str = None) -> dict:
|
date_hint: str = None,
|
||||||
|
b64: str = None,
|
||||||
|
mimetype: str = None) -> dict:
|
||||||
"""Parse a single receipt into structured fields.
|
"""Parse a single receipt into structured fields.
|
||||||
|
|
||||||
Strategy (most-reliable first):
|
Strategy (most-reliable first):
|
||||||
amount → regex on OCR text (deterministic)
|
amount → regex on OCR text (deterministic, never ask LLM)
|
||||||
date → filename timestamp > OCR regex > today
|
date → filename timestamp > OCR regex > today
|
||||||
vendor → LLM (short excerpt, first ~600 chars)
|
vendor → vision LLM (image) > text LLM (OCR excerpt) > filename
|
||||||
product_name→ LLM (semantic match against expense product list)
|
product_name → same LLM call as vendor
|
||||||
|
|
||||||
The LLM is intentionally NOT asked for amount or date — the local
|
Vision mode (RECEIPT_VISION_MODE=vision, default):
|
||||||
model hallucinates those fields when OCR text is ambiguous.
|
When the upload is a JPEG/PNG/etc., the raw image is sent to the
|
||||||
|
vision-capable LLM so it can read logos and stylised fonts that
|
||||||
|
Tesseract OCR mangles. If the vision call fails for any reason
|
||||||
|
(model error, timeout, bad JSON) the text path is used as fallback.
|
||||||
|
|
||||||
|
Text mode (RECEIPT_VISION_MODE=text):
|
||||||
|
Classic behaviour — only Tesseract OCR text is forwarded to the LLM.
|
||||||
|
Set in .env to instantly revert without rebuilding the container.
|
||||||
"""
|
"""
|
||||||
today = _date.today().isoformat()
|
today = _date.today().isoformat()
|
||||||
stripped = (text or '').strip()
|
stripped = (text or '').strip()
|
||||||
@@ -541,15 +574,77 @@ class ExpensesAgent(BaseAgent):
|
|||||||
else:
|
else:
|
||||||
date = today
|
date = today
|
||||||
|
|
||||||
# ── Vendor + Category: LLM (two fields only) ─────────────────────────
|
# ── Vendor + Category: LLM ───────────────────────────────────────────
|
||||||
vendor = filename
|
vendor = filename
|
||||||
product_name = ''
|
product_name = ''
|
||||||
product_list = ', '.join(f'"{p["name"]}"' for p in (expense_products or []))
|
product_list = ', '.join(f'"{p["name"]}"' for p in (expense_products or []))
|
||||||
|
|
||||||
|
if not product_list:
|
||||||
|
# No expense products configured — nothing to categorise
|
||||||
|
return {'vendor': vendor, 'amount': amount, 'date': date,
|
||||||
|
'time': None, 'product_name': ''}
|
||||||
|
|
||||||
|
# Shared category guidance used in both prompt paths
|
||||||
|
_cat_guide = (
|
||||||
|
'Guide: restaurant / cafe / fast food / food court → food/meal product; '
|
||||||
|
'airline / airport / transit / taxi / parking / rental car → travel product; '
|
||||||
|
'gas station / petrol / fuel → fuel product; '
|
||||||
|
'hotel / motel / lodging → accommodation product; '
|
||||||
|
'hardware / home improvement / tech / office supply store → supplies product. '
|
||||||
|
'Return "" if nothing fits.'
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── Path A: vision LLM ───────────────────────────────────────────────
|
||||||
|
# Use when: vision mode is enabled AND the file is a supported image type.
|
||||||
|
# The model sees the actual receipt image — no OCR garbling, reads logos
|
||||||
|
# and stylised fonts directly. Falls through to Path B on any failure.
|
||||||
|
use_vision = (
|
||||||
|
_get_vision_mode() == 'vision'
|
||||||
|
and bool(b64)
|
||||||
|
and mimetype in _VISION_MIMETYPES
|
||||||
|
)
|
||||||
|
|
||||||
|
if use_vision:
|
||||||
|
vision_prompt = (
|
||||||
|
'Return ONLY valid JSON with exactly two keys:\n'
|
||||||
|
'"vendor": the business name printed at the top of this receipt '
|
||||||
|
'(first 1-3 lines; ignore slogans, product item names, '
|
||||||
|
'and payment-processor logos).\n'
|
||||||
|
f'"product_name": pick the single best match from [{product_list}]. '
|
||||||
|
f'{_cat_guide}\n'
|
||||||
|
'JSON only:'
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
resp = await self._llm.submit(
|
||||||
|
[{'role': 'user', 'content': vision_prompt, 'images': [b64]}],
|
||||||
|
caller='expenses_agent_receipt_parser',
|
||||||
|
)
|
||||||
|
raw = (resp.content or '').strip()
|
||||||
|
first, last = raw.find('{'), raw.rfind('}')
|
||||||
|
if first != -1 and last > first:
|
||||||
|
data = json.loads(raw[first:last + 1])
|
||||||
|
v = str(data.get('vendor', '') or '').strip()
|
||||||
|
if v:
|
||||||
|
vendor = v
|
||||||
|
product_name = str(data.get('product_name', '') or '').strip()
|
||||||
|
logger.debug('vision vendor=%r product=%r for %s', vendor, product_name, filename)
|
||||||
|
return {'vendor': vendor, 'amount': amount, 'date': date,
|
||||||
|
'time': None, 'product_name': product_name}
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning(
|
||||||
|
'Vision LLM failed for %s: %s — falling back to text path',
|
||||||
|
filename, exc,
|
||||||
|
)
|
||||||
|
# Reset vendor so the text path starts fresh
|
||||||
|
vendor = filename
|
||||||
|
product_name = ''
|
||||||
|
|
||||||
|
# ── Path B: text-only (OCR excerpt) ─────────────────────────────────
|
||||||
|
# Used when: vision mode is off, mimetype is not an image (PDF/TXT/HTML),
|
||||||
|
# or the vision call failed.
|
||||||
if not ocr_failed:
|
if not ocr_failed:
|
||||||
# Give LLM only the header of the receipt — vendor is in the first lines
|
|
||||||
excerpt = stripped[:600]
|
excerpt = stripped[:600]
|
||||||
prompt = (
|
text_prompt = (
|
||||||
'Return ONLY valid JSON with exactly two keys:\n'
|
'Return ONLY valid JSON with exactly two keys:\n'
|
||||||
'"vendor": the business name printed at the TOP of the receipt '
|
'"vendor": the business name printed at the TOP of the receipt '
|
||||||
'(usually the first 1-3 lines). '
|
'(usually the first 1-3 lines). '
|
||||||
@@ -565,28 +660,20 @@ class ExpensesAgent(BaseAgent):
|
|||||||
'multiple transactions rather than a single merchant receipt, '
|
'multiple transactions rather than a single merchant receipt, '
|
||||||
'use "". Use "" if no clear business name is visible.\n'
|
'use "". Use "" if no clear business name is visible.\n'
|
||||||
f'"product_name": pick the single best match from [{product_list}]. '
|
f'"product_name": pick the single best match from [{product_list}]. '
|
||||||
'Guide: restaurant / cafe / fast food / food court → food/meal product; '
|
f'{_cat_guide}\n\n'
|
||||||
'airline / airport / transit / taxi / parking / rental car → travel product; '
|
|
||||||
'gas station / petrol / fuel → fuel product; '
|
|
||||||
'hotel / motel / lodging → accommodation product; '
|
|
||||||
'hardware / home improvement / tech / office supply store → supplies product. '
|
|
||||||
'Return "" if nothing fits.\n\n'
|
|
||||||
f'Receipt text:\n{excerpt}\n\nJSON only:'
|
f'Receipt text:\n{excerpt}\n\nJSON only:'
|
||||||
)
|
)
|
||||||
elif product_list:
|
else:
|
||||||
# OCR failed — guess category from filename only
|
# OCR failed entirely — guess category from filename only
|
||||||
prompt = (
|
text_prompt = (
|
||||||
f'A receipt file named "{filename}" could not be read. '
|
f'A receipt file named "{filename}" could not be read. '
|
||||||
f'Pick the most likely match from [{product_list}] based on the filename, '
|
f'Pick the most likely match from [{product_list}] based on the filename, '
|
||||||
f'or "". Return ONLY: {{"vendor": "", "product_name": "..."}}'
|
f'or "". Return ONLY: {{"vendor": "", "product_name": "..."}}'
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
return {'vendor': filename, 'amount': amount, 'date': date,
|
|
||||||
'time': None, 'product_name': ''}
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
resp = await self._llm.submit(
|
resp = await self._llm.submit(
|
||||||
[{'role': 'user', 'content': prompt}],
|
[{'role': 'user', 'content': text_prompt}],
|
||||||
caller='expenses_agent_receipt_parser',
|
caller='expenses_agent_receipt_parser',
|
||||||
)
|
)
|
||||||
raw = (resp.content or '').strip()
|
raw = (resp.content or '').strip()
|
||||||
|
|||||||
@@ -50,6 +50,11 @@ class Settings(BaseSettings):
|
|||||||
postgres_min_connections: int = 2
|
postgres_min_connections: int = 2
|
||||||
postgres_max_connections: int = 10
|
postgres_max_connections: int = 10
|
||||||
|
|
||||||
|
# Receipt OCR / vision
|
||||||
|
# 'vision' — use vision LLM for vendor+category when an image is uploaded (default)
|
||||||
|
# 'text' — use Tesseract OCR text only (set RECEIPT_VISION_MODE=text to revert)
|
||||||
|
receipt_vision_mode: str = 'vision'
|
||||||
|
|
||||||
# Rate limiting
|
# Rate limiting
|
||||||
dispatch_rate_limit_per_user: int = 30 # requests per minute
|
dispatch_rate_limit_per_user: int = 30 # requests per minute
|
||||||
directive_timeout_minutes: int = 10
|
directive_timeout_minutes: int = 10
|
||||||
|
|||||||
@@ -428,7 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates():
|
|||||||
|
|
||||||
from agent_service.agents.expenses_agent import (
|
from agent_service.agents.expenses_agent import (
|
||||||
_extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
|
_extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
|
||||||
_MONTH_MAP,
|
_MONTH_MAP, _get_vision_mode,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -671,10 +671,10 @@ async def test_parse_ocr_failed_skips_llm_amount():
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_vendor_prompt_does_not_contain_mcdonalds():
|
async def test_vendor_prompt_does_not_contain_mcdonalds():
|
||||||
"""The vendor LLM prompt must not reference 'McDonald' as a correction
|
"""The text-path vendor prompt must not reference 'McDonald' — it biases
|
||||||
example — it biases the model toward returning McDonald's whenever OCR
|
the model toward returning McDonald's whenever OCR text is unclear.
|
||||||
text is unclear, causing unrelated receipts (Home Depot, HMSHost) to be
|
Pinned to text mode so vision path (which has its own cleaner prompt) does
|
||||||
misidentified as McDonald's.
|
not interfere.
|
||||||
"""
|
"""
|
||||||
agent = _make_agent()
|
agent = _make_agent()
|
||||||
captured: list[str] = []
|
captured: list[str] = []
|
||||||
@@ -689,24 +689,22 @@ async def test_vendor_prompt_does_not_contain_mcdonalds():
|
|||||||
|
|
||||||
agent._llm.submit = _capture
|
agent._llm.submit = _capture
|
||||||
|
|
||||||
await agent._parse_receipt_text(
|
with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
|
||||||
'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
|
await agent._parse_receipt_text(
|
||||||
'homedepot.jpg',
|
'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
|
||||||
expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
|
'homedepot.jpg',
|
||||||
)
|
expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
|
||||||
|
)
|
||||||
|
|
||||||
full_prompt = ' '.join(captured)
|
full_prompt = ' '.join(captured)
|
||||||
assert 'McDonald' not in full_prompt, (
|
assert 'McDonald' not in full_prompt, (
|
||||||
"Vendor prompt must not contain 'McDonald' — it biases the model toward "
|
"Text-path prompt must not contain 'McDonald' — it biases the model."
|
||||||
"returning McDonald's for any ambiguous receipt."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
|
async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
|
||||||
"""Prompt must explicitly tell the LLM not to substitute a brand name that
|
"""Text-path prompt must tell LLM not to substitute a brand not in the OCR text."""
|
||||||
isn't in the OCR text — prevents "default to well-known fast food" behaviour.
|
|
||||||
"""
|
|
||||||
agent = _make_agent()
|
agent = _make_agent()
|
||||||
captured: list[str] = []
|
captured: list[str] = []
|
||||||
|
|
||||||
@@ -720,14 +718,14 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
|
|||||||
|
|
||||||
agent._llm.submit = _capture
|
agent._llm.submit = _capture
|
||||||
|
|
||||||
await agent._parse_receipt_text(
|
with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
|
||||||
'(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
|
await agent._parse_receipt_text(
|
||||||
'sergios.jpg',
|
'(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
|
||||||
expense_products=[{'id': 1, 'name': 'Meals'}],
|
'sergios.jpg',
|
||||||
)
|
expense_products=[{'id': 1, 'name': 'Meals'}],
|
||||||
|
)
|
||||||
|
|
||||||
full_prompt = ' '.join(captured)
|
full_prompt = ' '.join(captured)
|
||||||
# The prompt should warn the model not to invent brand names
|
|
||||||
assert 'only use a brand name' in full_prompt.lower() or \
|
assert 'only use a brand name' in full_prompt.lower() or \
|
||||||
'do not' in full_prompt.lower() or \
|
'do not' in full_prompt.lower() or \
|
||||||
'not substitute' in full_prompt.lower(), (
|
'not substitute' in full_prompt.lower(), (
|
||||||
@@ -735,6 +733,129 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Vision LLM path — _parse_receipt_text with b64/mimetype
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_vision_path_sends_image_to_llm():
    """Vision mode with an image upload must attach the b64 data as 'images'."""
    expenses_agent = _make_agent()
    sent: list = []

    canned_reply = MagicMock()
    canned_reply.content = '{"vendor":"Home Depot","product_name":"Supplies"}'

    async def _record(messages, caller=None):
        # Stand-in for the LLM client: remember every message, return the canned reply.
        sent.extend(messages)
        return canned_reply

    expenses_agent._llm.submit = _record

    mode_patch = patch(
        'agent_service.agents.expenses_agent._get_vision_mode',
        return_value='vision',
    )
    with mode_patch:
        result = await expenses_agent._parse_receipt_text(
            'THE HOME DEPOT\nTotal: $36.78', 'homedepot.jpg',
            expense_products=[{'id': 1, 'name': 'Supplies'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )

    # Vendor comes from the LLM reply; amount is checked against the OCR text total.
    assert result['vendor'] == 'Home Depot'
    assert result['amount'] == 36.78

    assert len(sent) == 1
    only_message = sent[0]
    assert 'images' in only_message, "Vision path must include 'images' in LLM message"
    assert only_message['images'] == ['FAKEBASE64DATA']
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_text_mode_skips_vision_even_with_image():
    """With the mode patched to 'text', a supplied image must not reach the LLM."""
    expenses_agent = _make_agent()
    sent: list = []

    canned_reply = MagicMock()
    canned_reply.content = '{"vendor":"Home Depot","product_name":"Supplies"}'

    async def _record(messages, caller=None):
        # Stand-in for the LLM client: remember every message, return the canned reply.
        sent.extend(messages)
        return canned_reply

    expenses_agent._llm.submit = _record

    mode_patch = patch(
        'agent_service.agents.expenses_agent._get_vision_mode',
        return_value='text',
    )
    with mode_patch:
        await expenses_agent._parse_receipt_text(
            'THE HOME DEPOT\nTotal: $36.78', 'homedepot.jpg',
            expense_products=[{'id': 1, 'name': 'Supplies'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )

    # Exactly one LLM message, and it carries no image payload.
    assert len(sent) == 1
    only_message = sent[0]
    assert 'images' not in only_message, (
        "Text mode must NOT send images to the LLM."
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_vision_falls_back_to_text_on_llm_error():
    """A raising first LLM call must be followed by a second call whose reply is used."""
    expenses_agent = _make_agent()
    attempts = 0

    text_reply = MagicMock()
    text_reply.content = '{"vendor":"Shell","product_name":"Fuel"}'

    async def _fail_once(messages, caller=None):
        # First invocation raises; every later one returns the canned reply.
        nonlocal attempts
        attempts += 1
        if attempts == 1:
            raise RuntimeError('simulated vision model error')
        return text_reply

    expenses_agent._llm.submit = _fail_once

    mode_patch = patch(
        'agent_service.agents.expenses_agent._get_vision_mode',
        return_value='vision',
    )
    with mode_patch:
        result = await expenses_agent._parse_receipt_text(
            'SHELL GAS STATION\nTotal Sale $55.00', 'shell.jpg',
            expense_products=[{'id': 1, 'name': 'Fuel'}],
            b64='FAKEBASE64DATA',
            mimetype='image/jpeg',
        )

    assert attempts == 2, "Must make exactly 2 LLM calls (vision failed, text succeeded)"
    assert result['vendor'] == 'Shell'
    assert result['amount'] == 55.00
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_non_image_mimetype_uses_text_path_in_vision_mode():
    """A PDF upload must not be sent as an image even when the mode is 'vision'."""
    expenses_agent = _make_agent()
    sent: list = []

    canned_reply = MagicMock()
    canned_reply.content = '{"vendor":"United Airlines","product_name":"Travel"}'

    async def _record(messages, caller=None):
        # Stand-in for the LLM client: remember every message, return the canned reply.
        sent.extend(messages)
        return canned_reply

    expenses_agent._llm.submit = _record

    mode_patch = patch(
        'agent_service.agents.expenses_agent._get_vision_mode',
        return_value='vision',
    )
    with mode_patch:
        await expenses_agent._parse_receipt_text(
            'United Airlines\nBaggage Fee\nTotal: $45.00', 'ticket.pdf',
            expense_products=[{'id': 1, 'name': 'Travel'}],
            b64='FAKEBASE64DATA',
            mimetype='application/pdf',  # not an image MIME type, so no vision call
        )

    # Exactly one LLM message, and it carries no image payload.
    assert len(sent) == 1
    only_message = sent[0]
    assert 'images' not in only_message, (
        "PDF receipts must not be sent as images even in vision mode."
    )
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# parse_upload — receipt_parser.py
|
# parse_upload — receipt_parser.py
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|||||||
Reference in New Issue
Block a user