diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py
index f1a81fd..ecfbdcd 100644
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -55,6 +55,26 @@ def _is_likely_bank_statement(text: str) -> bool:
     return count >= _STMT_AMOUNT_LINE_THRESHOLD
 
 
+# Image MIME types the vision LLM can process. PDF/HTML/TXT use text-only path.
+_VISION_MIMETYPES = frozenset({
+    'image/jpeg', 'image/png', 'image/gif',
+    'image/bmp', 'image/tiff', 'image/webp',
+})
+
+
+def _get_vision_mode() -> str:
+    """Return the configured receipt_vision_mode ('vision' | 'text').
+
+    Thin wrapper over get_settings() so tests patch one symbol, not the
+    lru_cache.  Value is trimmed + lower-cased; 'vision' on any error.
+    """
+    try:
+        from ..config import get_settings
+        return str(get_settings().receipt_vision_mode).strip().lower()
+    except Exception:
+        return 'vision'
+
+
 _DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b')  # YYYY-MM-DD or YYYY/MM/DD
 _DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b')  # M/D/YYYY
 _DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b')  # M/D/YY
@@ -323,12 +343,16 @@ class ExpensesAgent(BaseAgent):
             logger.info('ocr filename=%r date_hint=%r ocr_len=%d text_preview=%r',
                         r.get('filename'), r.get('date_from_name'), ocr_len, ocr_preview)
 
-        # Parse all receipts concurrently
+        # Parse all receipts concurrently.
+        # b64 + mimetype are forwarded so _parse_receipt_text can use the
+        # vision LLM path when RECEIPT_VISION_MODE=vision (the default).
         parse_tasks = [
             self._parse_receipt_text(
                 r.get('text', ''), r.get('filename', 'receipt'),
                 expense_products=expense_products,
                 date_hint=r.get('date_from_name'),
+                b64=r.get('b64'),
+                mimetype=r.get('mimetype'),
             )
             for r in unique_receipts
         ]
@@ -500,17 +524,26 @@ class ExpensesAgent(BaseAgent):
 
     async def _parse_receipt_text(self, text: str, filename: str,
                                   expense_products: list = None,
-                                  date_hint: str = None) -> dict:
+                                  date_hint: str = None,
+                                  b64: str = None,
+                                  mimetype: str = None) -> dict:
         """Parse a single receipt into structured fields.
 
         Strategy (most-reliable first):
-          amount      → regex on OCR text (deterministic)
-          date        → filename timestamp > OCR regex > today
-          vendor      → LLM (short excerpt, first ~600 chars)
-          product_name→ LLM (semantic match against expense product list)
+          amount       → regex on OCR text (deterministic, never ask LLM)
+          date         → filename timestamp > OCR regex > today
+          vendor       → vision LLM (image) > text LLM (OCR excerpt) > filename
+          product_name → same LLM call as vendor
 
-        The LLM is intentionally NOT asked for amount or date — the local
-        model hallucinates those fields when OCR text is ambiguous.
+        Vision mode (RECEIPT_VISION_MODE=vision, default):
+          When the upload is a JPEG/PNG/etc., the raw image is sent to the
+          vision-capable LLM so it can read logos and stylised fonts that
+          Tesseract OCR mangles.  If the vision call fails for any reason
+          (model error, timeout, bad JSON) the text path is used as fallback.
+
+        Text mode (RECEIPT_VISION_MODE=text):
+          Classic behaviour — only Tesseract OCR text is forwarded to the LLM.
+          Set in .env to instantly revert without rebuilding the container.
         """
         today = _date.today().isoformat()
         stripped = (text or '').strip()
@@ -541,15 +574,78 @@ class ExpensesAgent(BaseAgent):
         else:
             date = today
 
-        # ── Vendor + Category: LLM (two fields only) ─────────────────────────
+        # ── Vendor + Category: LLM ───────────────────────────────────────────
+        # NOTE: an empty product list must not skip the LLM — the vendor is
+        # still extracted; only OCR-failed + no products returns early (below).
         vendor = filename
         product_name = ''
         product_list = ', '.join(f'"{p["name"]}"' for p in (expense_products or []))
 
+        # Shared category guidance used in both prompt paths
+        _cat_guide = (
+            'Guide: restaurant / cafe / fast food / food court → food/meal product; '
+            'airline / airport / transit / taxi / parking / rental car → travel product; '
+            'gas station / petrol / fuel → fuel product; '
+            'hotel / motel / lodging → accommodation product; '
+            'hardware / home improvement / tech / office supply store → supplies product. '
+            'Return "" if nothing fits.'
+        )
+
+        # ── Path A: vision LLM ───────────────────────────────────────────────
+        # Use when: vision mode is enabled AND the file is a supported image type.
+        # The model sees the actual receipt image — no OCR garbling, reads logos
+        # and stylised fonts directly. Falls through to Path B on any failure.
+        use_vision = (
+            _get_vision_mode() == 'vision'
+            and bool(b64)
+            and (mimetype or '').lower() in _VISION_MIMETYPES
+        )
+
+        if use_vision:
+            vision_prompt = (
+                'Return ONLY valid JSON with exactly two keys:\n'
+                '"vendor": the business name printed at the top of this receipt '
+                '(first 1-3 lines; ignore slogans, product item names, '
+                'and payment-processor logos).\n'
+                f'"product_name": pick the single best match from [{product_list}]. '
+                f'{_cat_guide}\n'
+                'JSON only:'
+            )
+            try:
+                resp = await self._llm.submit(
+                    [{'role': 'user', 'content': vision_prompt, 'images': [b64]}],
+                    caller='expenses_agent_receipt_parser',
+                )
+                raw = (resp.content or '').strip()
+                first, last = raw.find('{'), raw.rfind('}')
+                if first != -1 and last > first:
+                    data = json.loads(raw[first:last + 1])
+                    v = str(data.get('vendor', '') or '').strip()
+                    if v:
+                        vendor = v
+                    product_name = str(data.get('product_name', '') or '').strip()
+                    logger.debug('vision vendor=%r product=%r for %s', vendor, product_name, filename)
+                    return {'vendor': vendor, 'amount': amount, 'date': date,
+                            'time': None, 'product_name': product_name}
+                logger.warning(
+                    'Vision LLM returned no JSON for %s — falling back to text path',
+                    filename,
+                )
+            except Exception as exc:
+                logger.warning(
+                    'Vision LLM failed for %s: %s — falling back to text path',
+                    filename, exc,
+                )
+            # Reset vendor so the text path starts fresh
+            vendor = filename
+            product_name = ''
+
+        # ── Path B: text-only (OCR excerpt) ─────────────────────────────────
+        # Used when: vision mode is off, mimetype is not an image (PDF/TXT/HTML),
+        # or the vision call failed.
         if not ocr_failed:
-            # Give LLM only the header of the receipt — vendor is in the first lines
             excerpt = stripped[:600]
-            prompt = (
+            text_prompt = (
                 'Return ONLY valid JSON with exactly two keys:\n'
                 '"vendor": the business name printed at the TOP of the receipt '
                 '(usually the first 1-3 lines). '
@@ -565,28 +661,23 @@ class ExpensesAgent(BaseAgent):
                 'multiple transactions rather than a single merchant receipt, '
                 'use "". Use "" if no clear business name is visible.\n'
                 f'"product_name": pick the single best match from [{product_list}]. '
-                'Guide: restaurant / cafe / fast food / food court → food/meal product; '
-                'airline / airport / transit / taxi / parking / rental car → travel product; '
-                'gas station / petrol / fuel → fuel product; '
-                'hotel / motel / lodging → accommodation product; '
-                'hardware / home improvement / tech / office supply store → supplies product. '
-                'Return "" if nothing fits.\n\n'
+                f'{_cat_guide}\n\n'
                 f'Receipt text:\n{excerpt}\n\nJSON only:'
             )
         elif product_list:
             # OCR failed — guess category from filename only
-            prompt = (
+            text_prompt = (
                 f'A receipt file named "{filename}" could not be read. '
                 f'Pick the most likely match from [{product_list}] based on the filename, '
                 f'or "". Return ONLY: {{"vendor": "", "product_name": "..."}}'
             )
         else:
             return {'vendor': filename, 'amount': amount, 'date': date,
                     'time': None, 'product_name': ''}
 
         try:
             resp = await self._llm.submit(
-                [{'role': 'user', 'content': prompt}],
+                [{'role': 'user', 'content': text_prompt}],
                 caller='expenses_agent_receipt_parser',
             )
             raw = (resp.content or '').strip()
diff --git a/agent_service/config.py b/agent_service/config.py
index 3a11b0d..6791b83 100644
--- a/agent_service/config.py
+++ b/agent_service/config.py
@@ -50,6 +50,11 @@ class Settings(BaseSettings):
     postgres_min_connections: int = 2
     postgres_max_connections: int = 10
 
+    # Receipt OCR / vision
+    #   'vision' — use vision LLM for vendor+category when an image is uploaded (default)
+    #   'text'   — use Tesseract OCR text only (set RECEIPT_VISION_MODE=text to revert)
+    receipt_vision_mode: str = 'vision'
+
     # Rate limiting
     dispatch_rate_limit_per_user: int = 30  # requests per minute
     directive_timeout_minutes: int = 10
diff --git a/tests/test_expenses_agent.py b/tests/test_expenses_agent.py
index e54386a..240093d 100644
--- a/tests/test_expenses_agent.py
+++ b/tests/test_expenses_agent.py
@@ -428,7 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates():
 from agent_service.agents.expenses_agent import (
     _extract_amount_from_text, _extract_date_from_text,
     _is_likely_bank_statement,
-    _MONTH_MAP,
+    _MONTH_MAP, _get_vision_mode,
 )
 
 
@@ -671,10 +671,10 @@ async def test_parse_ocr_failed_skips_llm_amount():
 
 @pytest.mark.asyncio
 async def test_vendor_prompt_does_not_contain_mcdonalds():
-    """The vendor LLM prompt must not reference 'McDonald' as a correction
-    example — it biases the model toward returning McDonald's whenever OCR
-    text is unclear, causing unrelated receipts (Home Depot, HMSHost) to be
-    misidentified as McDonald's.
+    """The text-path vendor prompt must not reference 'McDonald' — it biases
+    the model toward returning McDonald's whenever OCR text is unclear.
+    Pinned to text mode so vision path (which has its own cleaner prompt) does
+    not interfere.
     """
     agent = _make_agent()
     captured: list[str] = []
@@ -689,24 +689,22 @@ async def test_vendor_prompt_does_not_contain_mcdonalds():
 
     agent._llm.submit = _capture
 
-    await agent._parse_receipt_text(
-        'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
-        'homedepot.jpg',
-        expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
-    )
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
+        await agent._parse_receipt_text(
+            'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
+            'homedepot.jpg',
+            expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
+        )
 
     full_prompt = ' '.join(captured)
     assert 'McDonald' not in full_prompt, (
-        "Vendor prompt must not contain 'McDonald' — it biases the model toward "
-        "returning McDonald's for any ambiguous receipt."
+        "Text-path prompt must not contain 'McDonald' — it biases the model."
     )
 
 
 @pytest.mark.asyncio
 async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
-    """Prompt must explicitly tell the LLM not to substitute a brand name that
-    isn't in the OCR text — prevents "default to well-known fast food" behaviour.
-    """
+    """Text-path prompt must tell LLM not to substitute a brand not in the OCR text."""
     agent = _make_agent()
     captured: list[str] = []
 
@@ -720,14 +718,14 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
 
     agent._llm.submit = _capture
 
-    await agent._parse_receipt_text(
-        '(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
-        'sergios.jpg',
-        expense_products=[{'id': 1, 'name': 'Meals'}],
-    )
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
+        await agent._parse_receipt_text(
+            '(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
+            'sergios.jpg',
+            expense_products=[{'id': 1, 'name': 'Meals'}],
+        )
 
     full_prompt = ' '.join(captured)
-    # The prompt should warn the model not to invent brand names
     assert 'only use a brand name' in full_prompt.lower() or \
            'do not' in full_prompt.lower() or \
            'not substitute' in full_prompt.lower(), (
@@ -735,6 +733,165 @@ async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
     )
 
 
+# ---------------------------------------------------------------------------
+# Vision LLM path — _parse_receipt_text with b64/mimetype
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_vision_path_sends_image_to_llm():
+    """In vision mode, the LLM call includes an 'images' key with the b64 data."""
+    agent = _make_agent()
+    captured_messages: list = []
+
+    llm_resp = MagicMock()
+    llm_resp.content = '{"vendor":"Home Depot","product_name":"Supplies"}'
+
+    async def _capture(messages, caller=None):
+        captured_messages.extend(messages)
+        return llm_resp
+
+    agent._llm.submit = _capture
+
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='vision'):
+        result = await agent._parse_receipt_text(
+            'THE HOME DEPOT\nTotal: $36.78', 'homedepot.jpg',
+            expense_products=[{'id': 1, 'name': 'Supplies'}],
+            b64='FAKEBASE64DATA',
+            mimetype='image/jpeg',
+        )
+
+    assert result['vendor'] == 'Home Depot'
+    assert result['amount'] == 36.78
+    assert len(captured_messages) == 1
+    msg = captured_messages[0]
+    assert 'images' in msg, "Vision path must include 'images' in LLM message"
+    assert msg['images'] == ['FAKEBASE64DATA']
+
+
+@pytest.mark.asyncio
+async def test_text_mode_skips_vision_even_with_image():
+    """When RECEIPT_VISION_MODE=text, b64 is ignored and no images are sent."""
+    agent = _make_agent()
+    captured_messages: list = []
+
+    llm_resp = MagicMock()
+    llm_resp.content = '{"vendor":"Home Depot","product_name":"Supplies"}'
+
+    async def _capture(messages, caller=None):
+        captured_messages.extend(messages)
+        return llm_resp
+
+    agent._llm.submit = _capture
+
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
+        await agent._parse_receipt_text(
+            'THE HOME DEPOT\nTotal: $36.78', 'homedepot.jpg',
+            expense_products=[{'id': 1, 'name': 'Supplies'}],
+            b64='FAKEBASE64DATA',
+            mimetype='image/jpeg',
+        )
+
+    assert len(captured_messages) == 1
+    assert 'images' not in captured_messages[0], (
+        "Text mode must NOT send images to the LLM."
+    )
+
+
+@pytest.mark.asyncio
+async def test_vision_falls_back_to_text_on_llm_error():
+    """If the vision LLM call raises, the text path is tried as fallback."""
+    agent = _make_agent()
+    call_count = [0]
+
+    llm_resp = MagicMock()
+    llm_resp.content = '{"vendor":"Shell","product_name":"Fuel"}'
+
+    async def _first_fails(messages, caller=None):
+        call_count[0] += 1
+        if call_count[0] == 1:
+            raise RuntimeError('simulated vision model error')
+        return llm_resp
+
+    agent._llm.submit = _first_fails
+
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='vision'):
+        result = await agent._parse_receipt_text(
+            'SHELL GAS STATION\nTotal Sale $55.00', 'shell.jpg',
+            expense_products=[{'id': 1, 'name': 'Fuel'}],
+            b64='FAKEBASE64DATA',
+            mimetype='image/jpeg',
+        )
+
+    assert call_count[0] == 2, "Must make exactly 2 LLM calls (vision failed, text succeeded)"
+    assert result['vendor'] == 'Shell'
+    assert result['amount'] == 55.00
+
+
+@pytest.mark.asyncio
+async def test_non_image_mimetype_uses_text_path_in_vision_mode():
+    """PDFs and text files must always use the text path even in vision mode."""
+    agent = _make_agent()
+    captured_messages: list = []
+
+    llm_resp = MagicMock()
+    llm_resp.content = '{"vendor":"United Airlines","product_name":"Travel"}'
+
+    async def _capture(messages, caller=None):
+        captured_messages.extend(messages)
+        return llm_resp
+
+    agent._llm.submit = _capture
+
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='vision'):
+        await agent._parse_receipt_text(
+            'United Airlines\nBaggage Fee\nTotal: $45.00', 'ticket.pdf',
+            expense_products=[{'id': 1, 'name': 'Travel'}],
+            b64='FAKEBASE64DATA',
+            mimetype='application/pdf',  # NOT an image — no vision
+        )
+
+    assert len(captured_messages) == 1
+    assert 'images' not in captured_messages[0], (
+        "PDF receipts must not be sent as images even in vision mode."
+    )
+
+
+def test_get_vision_mode_normalises_and_defaults():
+    """Setting value is trimmed/lower-cased; any config error yields 'vision'."""
+    settings = MagicMock()
+    settings.receipt_vision_mode = ' TEXT '
+    with patch('agent_service.config.get_settings', return_value=settings):
+        assert _get_vision_mode() == 'text'
+    with patch('agent_service.config.get_settings', side_effect=RuntimeError('boom')):
+        assert _get_vision_mode() == 'vision'
+
+
+@pytest.mark.asyncio
+async def test_no_expense_products_still_asks_llm_for_vendor():
+    """An empty product list must not short-circuit vendor extraction."""
+    agent = _make_agent()
+    captured_messages: list = []
+
+    llm_resp = MagicMock()
+    llm_resp.content = '{"vendor":"Home Depot","product_name":""}'
+
+    async def _capture(messages, caller=None):
+        captured_messages.extend(messages)
+        return llm_resp
+
+    agent._llm.submit = _capture
+
+    with patch('agent_service.agents.expenses_agent._get_vision_mode', return_value='text'):
+        await agent._parse_receipt_text(
+            'THE HOME DEPOT\nTotal: $36.78', 'homedepot.jpg',
+            expense_products=[],
+        )
+
+    assert len(captured_messages) == 1, (
+        "Vendor extraction must run even when no expense products exist."
+    )
+
+
 # ---------------------------------------------------------------------------
 # parse_upload — receipt_parser.py
 # ---------------------------------------------------------------------------