Fix vendor mis-identification (McDonald's bias), MIA Parking amount, grayscale OCR fallback

- Remove "NeDonald's → McDonald's" from LLM vendor correction examples; the
  example was biasing the model to return McDonald's for any ambiguous receipt
  (Home Depot, Sergio's/HMSHost). Replace with neutral brand examples and add
  an explicit instruction not to substitute a brand name absent from the OCR
  text.
- Add `net\s*fee` to _TOTAL_RE so MIA Parking kiosk receipts
  ("net fee: 150.00 USD") are captured by Pass 1 rather than the max-scan,
  which could pick a larger line.
- Add Step 5b grayscale fallback in receipt_parser: if all binarized PSM
  attempts yield < 20 chars, retry OCR on the pre-binarization grayscale image.
  Fixes dot-matrix and certain thermal-print fonts destroyed by the
  160-threshold.
- Tests: 88 passing (test_net_fee_parking,
  test_vendor_prompt_does_not_contain_mcdonalds,
  test_vendor_prompt_instructs_not_to_guess_absent_brand).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 00:56:45 -04:00
parent ece811cccb
commit db06fede5f
3 changed files with 109 additions and 7 deletions
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -23,7 +23,7 @@ from ..tools.expenses_tools import ExpensesTools
 _TOTAL_RE = re.compile(
    r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|'
    r'total\s*amount|total\s*charged|total\s*sale|net\s*sale|'
-    r'sale\s*total|you\s*paid|amount\s*paid|total)'
+    r'sale\s*total|you\s*paid|amount\s*paid|net\s*fee|total)'
    r'(?!\s*tax)'                       # exclude "Total Tax / Total Taxes"
    r'\s*[:\-]?\s*\$?\s*([\d,]+\.\d{2})',
    re.IGNORECASE,
@@ -551,19 +551,25 @@ class ExpensesAgent(BaseAgent):
            excerpt = stripped[:600]
            prompt = (
                'Return ONLY valid JSON with exactly two keys:\n'
-                '"vendor": the merchant or store name from the receipt header. '
-                'OCR often garbles text — use your knowledge to correct obvious '
-                'errors (e.g. "NeDonald\'s" → "McDonald\'s", "TN-N-QUT" → '
-                '"IN-N-OUT Burger", "Subwey" → "Subway", "LRYAL" → "LAYAL"). '
+                '"vendor": the business name printed at the TOP of the receipt '
+                '(usually the first 1-3 lines). '
+                'Ignore slogans ("How doers get more done"), product item names, '
+                'and payment-processor logos. '
+                'OCR often substitutes look-alike characters — correct obvious '
+                'errors (e.g. "LRYAL" → "LAYAL", "Subwey" → "Subway", '
+                '"H0ME DEP0T" → "HOME DEPOT", "W4LMART" → "WALMART"). '
+                'IMPORTANT: only use a brand name that is clearly present in the '
+                'text — do NOT substitute a different well-known brand if the '
+                'name is merely unclear. '
                'If this looks like a bank or credit-card statement listing '
                'multiple transactions rather than a single merchant receipt, '
                'use "". Use "" if no clear business name is visible.\n'
                f'"product_name": pick the single best match from [{product_list}]. '
-                'Guide: restaurant / cafe / fast food → food/meal product; '
+                'Guide: restaurant / cafe / fast food / food court → food/meal product; '
                'airline / airport / transit / taxi / parking / rental car → travel product; '
                'gas station / petrol / fuel → fuel product; '
                'hotel / motel / lodging → accommodation product; '
-                'office / tech / hardware store → supplies product. '
+                'hardware / home improvement / tech / office supply store → supplies product. '
                'Return "" if nothing fits.\n\n'
                f'Receipt text:\n{excerpt}\n\nJSON only:'
            )
--- a/agent_service/tools/receipt_parser.py
+++ b/agent_service/tools/receipt_parser.py
@@ -130,6 +130,7 @@ def _ocr_image_tesseract(data: bytes, filename: str) -> str:
        # ── Step 3: Grayscale + contrast ─────────────────────────────────────
        img = ImageOps.grayscale(img)
        img = ImageOps.autocontrast(img)
+        img_gray = img  # save grayscale for fallback — before binarization

        # ── Step 4: Sharpen then binarize ─────────────────────────────────────
        # Sharpen first so edges are crisp before thresholding.
@@ -152,6 +153,23 @@ def _ocr_image_tesseract(data: bytes, filename: str) -> str:
            except Exception:
                pass

+        # ── Step 5b: Grayscale fallback ───────────────────────────────────────
+        # Binarization at threshold 160 can destroy dot-matrix and certain
+        # thermal-print fonts (e.g. parking kiosk receipts) where character
+        # pixels are close to the threshold and get wiped to white.  If every
+        # binarized attempt failed, retry on the plain grayscale image —
+        # Tesseract handles grey-level input reasonably well for these cases.
+        for psm in (6, 4, 11):
+            try:
+                text = pytesseract.image_to_string(
+                    img_gray, config=f'--oem 3 --psm {psm}').strip()
+                if len(text) >= 20:
+                    logger.debug('Tesseract grayscale fallback %s: psm=%d %d chars',
+                                 filename, psm, len(text))
+                    return text
+            except Exception:
+                pass
+
        logger.warning('Tesseract OCR %s: all PSM modes returned < 20 chars', filename)
        return ''
    except ImportError: