fix: vision OCR receipt extraction — skip second LLM call, fix total truncation

receipt_parser: change _ocr_image_vision() to extract structured JSON {vendor, amount, date, time, category} directly from the image instead of transcribing raw text. This makes the downstream LLM extraction step unnecessary and eliminates the error compounding of the two-step pipeline.

expenses_agent: add a _match_category() helper that maps vision category labels to expense product names via exact/substring match, with a fuzzy fallback. Add a fast path in _parse_receipt_text() that detects pre-extracted vision JSON (the whitespace-stripped text starts with '{' and the parsed object contains an 'amount' key) and skips the second LLM submit call entirely. Fix the text[:2000] truncation that discarded receipt totals: receipts longer than 3000 chars now keep the first 1500 and the last 1500 chars, so the grand total at the bottom is always included.

tests: fix the stale test_act_enters_awaiting_confirmation_on_first_pass (the confirmation gate was removed); add TestMatchCategory and three new tests covering the vision JSON fast path and the LLM fallthrough.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 21:49:31 -04:00
parent 7a0aad3f37
commit 11cc261923
3 changed files with 209 additions and 23 deletions
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -317,13 +317,63 @@ class ExpensesAgent(BaseAgent):

        return None

+    @staticmethod
+    def _match_category(category: str, expense_products: list) -> str:
+        """Map a vision-model category label to the nearest expense product name.
+
+        Tries exact/substring match first, then a fuzzy SequenceMatcher pass.
+        Returns empty string when no reasonable match is found.
+        """
+        if not expense_products or not category:
+            return ''
+        cat = category.lower().strip()
+        # Exact or substring match
+        for p in expense_products:
+            name = p['name'].lower()
+            if cat == name or cat in name or name in cat:
+                return p['name']
+        # Fuzzy fallback (ratio >= 0.4)
+        names_lower = [p['name'].lower() for p in expense_products]
+        matches = difflib.get_close_matches(cat, names_lower, n=1, cutoff=0.4)
+        if matches:
+            for p in expense_products:
+                if p['name'].lower() == matches[0]:
+                    return p['name']
+        return ''
+
    async def _parse_receipt_text(self, text: str, filename: str,
                                   expense_products: list = None,
                                   date_hint: str = None) -> dict:
        today = _date.today().isoformat()
        fallback = {'vendor': filename, 'amount': 0.0,
                    'date': date_hint or today, 'time': None, 'product_name': ''}
-        ocr_failed = not text or text.startswith('[')
+
+        # ── Fast path: vision model already returned structured JSON ──────────
+        # receipt_parser._ocr_image_vision() returns a JSON string directly
+        # when a vision model is configured.  Skip the second LLM call entirely.
+        stripped = (text or '').strip()
+        if stripped.startswith('{'):
+            try:
+                data = json.loads(stripped)
+                if 'amount' in data:
+                    logger.debug('expenses_agent: using vision pre-extracted JSON for %s', filename)
+                    # Map the vision category label → expense product name
+                    product_name = self._match_category(
+                        data.get('category', ''), expense_products or [])
+                    # Vision model sometimes returns the string "null" instead of JSON null
+                    raw_time = data.get('time')
+                    time_val = None if raw_time in (None, 'null', 'None', '') else str(raw_time)
+                    return {
+                        'vendor': str(data.get('vendor') or filename),
+                        'amount': float(data.get('amount', 0.0)),
+                        'date': str(data.get('date') or date_hint or today),
+                        'time': time_val,
+                        'product_name': product_name,
+                    }
+            except (json.JSONDecodeError, ValueError, TypeError):
+                pass  # not clean JSON — fall through to LLM path
+
+        ocr_failed = not stripped or stripped.startswith('[')

        product_list = ''
        if expense_products:
@@ -341,6 +391,13 @@ class ExpensesAgent(BaseAgent):
                f'Return ONLY valid JSON: {{"product_name": "..."}}'
            )
        else:
+            # Keep both the header (vendor/date) and footer (totals) of the receipt.
+            # A plain [:N] cut discards the bottom of long receipts where the grand
+            # total lives — the primary cause of amount=0 extraction errors.
+            if len(stripped) > 3000:
+                receipt_text = stripped[:1500] + '\n[...]\n' + stripped[-1500:]
+            else:
+                receipt_text = stripped
            prompt = (
                'Extract expense details from the following receipt text. '
                'Return ONLY valid JSON with these keys:\n'
@@ -354,7 +411,7 @@ class ExpensesAgent(BaseAgent):
                '"time" (string HH:MM in 24-hour format — the transaction time printed on the receipt; '
                'null if not present),\n'
                f'"product_name" (string, pick the best match from [{product_list}] or empty string).\n\n'
-                f'Receipt text:\n{text[:2000]}\n\nJSON only:'
+                f'Receipt text:\n{receipt_text}\n\nJSON only:'
            )
        try:
            resp = await self._llm.submit(