Remove vision OCR — use Tesseract-only pipeline for receipt parsing

The llama3.2-vision model was producing unreliable structured data (wrong vendors, amounts, dates), making expense reports worse than those produced by Tesseract + LLM extraction. This commit removes _ocr_image_vision(), the vision JSON fast path in _parse_receipt_text(), _match_category(), and the vision_ocr_model config setting entirely. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 22:32:26 -04:00
parent ec6b41943f
commit 0320591344
4 changed files with 4 additions and 247 deletions
--- a/agent_service/tools/receipt_parser.py
+++ b/agent_service/tools/receipt_parser.py
@@ -80,121 +80,10 @@ def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:


 def _ocr_image(data: bytes, filename: str) -> str:
-    """Extract text from a receipt image.
-
-    Tries vision-model OCR first when VISION_OCR_MODEL is configured,
-    then falls back to the Tesseract pipeline.
-    """
-    from agent_service.config import get_settings
-    settings = get_settings()
-    if settings.vision_ocr_model:
-        result = _ocr_image_vision(data, filename,
-                                   settings.ollama_url,
-                                   settings.vision_ocr_model)
-        if result:
-            return result
-        logger.warning('Vision OCR returned empty for %s — falling back to Tesseract', filename)
+    """Extract text from a receipt image using Tesseract."""
    return _ocr_image_tesseract(data, filename)


-def _ocr_image_vision(data: bytes, filename: str, ollama_url: str, model: str) -> str:
-    """Use an Ollama vision model to extract receipt data directly as JSON.
-
-    Returns a JSON string {vendor, amount, date, time, category} so the
-    expenses agent can skip the second LLM extraction step entirely.
-    Returns empty string on any failure so the caller falls back to Tesseract.
-    """
-    import json as _json
-    import re as _re
-
-    def _repair_json(s: str) -> str:
-        """Fix the most common LLM JSON formatting mistakes.
-
-        Handles:
-        - trailing commas before } or ]  →  {"a":1,}  becomes  {"a":1}
-        - single-quoted strings          →  {'a':'b'}  becomes  {"a":"b"}
-        - unquoted string keys           →  {a: "b"}   becomes  {"a": "b"}
-        """
-        # trailing commas
-        s = _re.sub(r',\s*([}\]])', r'\1', s)
-        # single-quoted strings (careful around apostrophes in values)
-        s = _re.sub(r"'([^']*)'", r'"\1"', s)
-        # unquoted keys: word characters before a colon
-        s = _re.sub(r'(?<!["\w])(\w+)\s*:', r'"\1":', s)
-        return s
-
-    try:
-        import ollama as _ollama
-        client = _ollama.Client(host=ollama_url)
-        response = client.chat(
-            model=model,
-            format='json',   # Ollama JSON mode — forces syntactically valid output
-            messages=[{
-                'role': 'user',
-                'content': (
-                    'You are a receipt data extractor. '
-                    'Read this receipt image and extract the following fields. '
-                    'Copy values EXACTLY as printed — do NOT guess, infer, or '
-                    'invent values you cannot clearly see.\n\n'
-                    'Fields to extract:\n'
-                    '- vendor: the store or restaurant name exactly as printed; '
-                    'empty string if not clearly visible\n'
-                    '- amount: the FINAL total the customer paid; find a line '
-                    'labeled "Total", "Grand Total", "Amount Due", or "Balance Due"; '
-                    'copy the number exactly; do NOT use subtotal, tax, or tip; '
-                    'return 0 if no clearly labeled final total is visible\n'
-                    '- date: transaction date in YYYY-MM-DD format; '
-                    'null if not clearly visible\n'
-                    '- time: transaction time in HH:MM 24-hour format; '
-                    'null if not clearly visible\n'
-                    '- category: one of: meals, fuel, hotel, office, transport, other\n\n'
-                    'Return ONLY a valid JSON object, no commentary, no markdown:\n'
-                    '{"vendor":"...","amount":0.00,"date":"YYYY-MM-DD or null",'
-                    '"time":"HH:MM or null","category":"..."}'
-                ),
-                'images': [data],
-            }],
-        )
-        if isinstance(response, dict):
-            raw = (response.get('message', {}).get('content') or '').strip()
-        else:
-            raw = (response.message.content or '').strip()
-
-        # Must contain a JSON object, not prose
-        first, last = raw.find('{'), raw.rfind('}')
-        if first == -1 or last <= first:
-            logger.warning('Vision OCR %s: model returned prose, falling back to Tesseract',
-                           filename)
-            return ''
-        json_str = raw[first:last + 1]
-
-        # Parse — on failure attempt common repairs then retry once
-        try:
-            parsed = _json.loads(json_str)
-        except _json.JSONDecodeError as json_err:
-            repaired = _repair_json(json_str)
-            try:
-                parsed = _json.loads(repaired)
-                logger.debug('Vision OCR %s: JSON repaired successfully', filename)
-            except _json.JSONDecodeError:
-                logger.warning('Vision OCR %s: JSON parse failed (%s), falling back',
-                               filename, json_err)
-                return ''
-
-        if 'amount' not in parsed:
-            logger.warning('Vision OCR %s: JSON missing amount field, falling back', filename)
-            return ''
-        logger.debug('Vision OCR %s (%s): extracted JSON ok', filename, model)
-        # Re-serialise so downstream always gets clean, canonical JSON
-        return _json.dumps(parsed)
-    except ImportError:
-        logger.warning('ollama package not installed — vision OCR unavailable for %s', filename)
-        return ''
-    except Exception as exc:
-        logger.warning('Vision OCR failed for %s: %s', filename, exc)
-        return ''
-
-
 def _ocr_image_tesseract(data: bytes, filename: str) -> str:
    """Tesseract-based OCR pipeline (fallback)."""
    try: