From ec6b41943ff02e9ee0c39750bc6b3e0dc2d0c0f6 Mon Sep 17 00:00:00 2001 From: Carlos Garcia Date: Wed, 20 May 2026 22:24:50 -0400 Subject: [PATCH] =?UTF-8?q?fix:=20vision=20OCR=20JSON=20failures=20?= =?UTF-8?q?=E2=80=94=20add=20format=3D'json'=20and=20repair=20fallback?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three receipts per batch were failing with JSONDecodeError (e.g. "Expecting ':' delimiter: line 1 column 90") because activeblue-chat (llama3.2-vision) occasionally outputs near-JSON with trailing commas, single-quoted strings, or unquoted keys. Two-layer fix: 1. Add format='json' to the Ollama chat call — Ollama JSON mode forces syntactically valid output at the sampler level, eliminating most structural errors. 2. Add _repair_json() fallback that runs on any remaining JSONDecodeError: strips trailing commas, converts single→double quotes, and quotes unquoted keys. If repair succeeds, the result is re-serialised as canonical JSON before being returned. Also re-serialise with json.dumps() on success so the fast path in _parse_receipt_text always receives clean, canonical JSON regardless of whitespace or key ordering in the model's original output. Co-Authored-By: Claude Sonnet 4.6 --- agent_service/tools/receipt_parser.py | 37 +++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/agent_service/tools/receipt_parser.py b/agent_service/tools/receipt_parser.py index 78fa480..ce69ac9 100644 --- a/agent_service/tools/receipt_parser.py +++ b/agent_service/tools/receipt_parser.py @@ -105,11 +105,30 @@ def _ocr_image_vision(data: bytes, filename: str, ollama_url: str, model: str) - Returns empty string on any failure so the caller falls back to Tesseract. """ import json as _json + import re as _re + + def _repair_json(s: str) -> str: + """Fix the most common LLM JSON formatting mistakes. + + Handles: + - trailing commas before } or ] → {"a":1,} becomes {"a":1} + - single-quoted strings → {'a':'b'} becomes {"a":"b"} + - unquoted string keys → {a: "b"} becomes {"a": "b"} + """ + # trailing commas + s = _re.sub(r',\s*([}\]])', r'\1', s) + # single-quoted strings (careful around apostrophes in values) + s = _re.sub(r"'([^']*)'", r'"\1"', s) + # unquoted keys: word characters before a colon + s = _re.sub(r'(?