From db06fede5f7932277e5f81b6ffc120b0f8b8c097 Mon Sep 17 00:00:00 2001 From: Carlos Garcia Date: Thu, 21 May 2026 00:56:45 -0400 Subject: [PATCH] Fix vendor mis-identification (McDonald's bias), MIA Parking amount, grayscale OCR fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove "NeDonald's → McDonald's" from LLM vendor correction examples; the example was biasing the model to return McDonald's for any ambiguous receipt (Home Depot, Sergio's/HMSHost). Replace with neutral brand examples and add an explicit instruction not to substitute a brand name absent from the OCR text. - Add `net\s*fee` to _TOTAL_RE so MIA Parking kiosk receipts ("net fee: 150.00 USD") are captured by Pass 1 rather than by the max-scan, which could pick a line with a larger amount. - Add Step 5b grayscale fallback in receipt_parser: if all binarized PSM attempts yield < 20 chars, retry OCR on the pre-binarization grayscale image. This recovers dot-matrix and certain thermal-print fonts that binarization at the fixed threshold of 160 destroys. - Tests: 88 passing, including the new test_net_fee_parking, test_vendor_prompt_does_not_contain_mcdonalds, and test_vendor_prompt_instructs_not_to_guess_absent_brand. 
Co-Authored-By: Claude Sonnet 4.6 --- agent_service/agents/expenses_agent.py | 20 ++++--- agent_service/tools/receipt_parser.py | 18 ++++++ tests/test_expenses_agent.py | 78 ++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 7 deletions(-) diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py index 3eca3ec..f1a81fd 100644 --- a/agent_service/agents/expenses_agent.py +++ b/agent_service/agents/expenses_agent.py @@ -23,7 +23,7 @@ from ..tools.expenses_tools import ExpensesTools _TOTAL_RE = re.compile( r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|' r'total\s*amount|total\s*charged|total\s*sale|net\s*sale|' - r'sale\s*total|you\s*paid|amount\s*paid|total)' + r'sale\s*total|you\s*paid|amount\s*paid|net\s*fee|total)' r'(?!\s*tax)' # exclude "Total Tax / Total Taxes" r'\s*[:\-]?\s*\$?\s*([\d,]+\.\d{2})', re.IGNORECASE, @@ -551,19 +551,25 @@ class ExpensesAgent(BaseAgent): excerpt = stripped[:600] prompt = ( 'Return ONLY valid JSON with exactly two keys:\n' - '"vendor": the merchant or store name from the receipt header. ' - 'OCR often garbles text — use your knowledge to correct obvious ' - 'errors (e.g. "NeDonald\'s" → "McDonald\'s", "TN-N-QUT" → ' - '"IN-N-OUT Burger", "Subwey" → "Subway", "LRYAL" → "LAYAL"). ' + '"vendor": the business name printed at the TOP of the receipt ' + '(usually the first 1-3 lines). ' + 'Ignore slogans ("How doers get more done"), product item names, ' + 'and payment-processor logos. ' + 'OCR often substitutes look-alike characters — correct obvious ' + 'errors (e.g. "LRYAL" → "LAYAL", "Subwey" → "Subway", ' + '"H0ME DEP0T" → "HOME DEPOT", "W4LMART" → "WALMART"). ' + 'IMPORTANT: only use a brand name that is clearly present in the ' + 'text — do NOT substitute a different well-known brand if the ' + 'name is merely unclear. ' 'If this looks like a bank or credit-card statement listing ' 'multiple transactions rather than a single merchant receipt, ' 'use "". 
Use "" if no clear business name is visible.\n' f'"product_name": pick the single best match from [{product_list}]. ' - 'Guide: restaurant / cafe / fast food → food/meal product; ' + 'Guide: restaurant / cafe / fast food / food court → food/meal product; ' 'airline / airport / transit / taxi / parking / rental car → travel product; ' 'gas station / petrol / fuel → fuel product; ' 'hotel / motel / lodging → accommodation product; ' - 'office / tech / hardware store → supplies product. ' + 'hardware / home improvement / tech / office supply store → supplies product. ' 'Return "" if nothing fits.\n\n' f'Receipt text:\n{excerpt}\n\nJSON only:' ) diff --git a/agent_service/tools/receipt_parser.py b/agent_service/tools/receipt_parser.py index e366e95..1e26e50 100644 --- a/agent_service/tools/receipt_parser.py +++ b/agent_service/tools/receipt_parser.py @@ -130,6 +130,7 @@ def _ocr_image_tesseract(data: bytes, filename: str) -> str: # ── Step 3: Grayscale + contrast ───────────────────────────────────── img = ImageOps.grayscale(img) img = ImageOps.autocontrast(img) + img_gray = img # save grayscale for fallback — before binarization # ── Step 4: Sharpen then binarize ───────────────────────────────────── # Sharpen first so edges are crisp before thresholding. @@ -152,6 +153,23 @@ def _ocr_image_tesseract(data: bytes, filename: str) -> str: except Exception: pass + # ── Step 5b: Grayscale fallback ─────────────────────────────────────── + # Binarization at threshold 160 can destroy dot-matrix and certain + # thermal-print fonts (e.g. parking kiosk receipts) where character + # pixels are close to the threshold and get wiped to white. If every + # binarized attempt failed, retry on the plain grayscale image — + # Tesseract handles grey-level input reasonably well for these cases. 
+ for psm in (6, 4, 11): + try: + text = pytesseract.image_to_string( + img_gray, config=f'--oem 3 --psm {psm}').strip() + if len(text) >= 20: + logger.debug('Tesseract grayscale fallback %s: psm=%d %d chars', + filename, psm, len(text)) + return text + except Exception: + pass + logger.warning('Tesseract OCR %s: all PSM modes returned < 20 chars', filename) return '' except ImportError: diff --git a/tests/test_expenses_agent.py b/tests/test_expenses_agent.py index 8f45a8b..e54386a 100644 --- a/tests/test_expenses_agent.py +++ b/tests/test_expenses_agent.py @@ -524,6 +524,18 @@ class TestExtractAmount: text = 'Items 8.49\nCash Tendered 20.00\nChange 11.51' assert _extract_amount_from_text(text) == 8.49 + def test_net_fee_parking(self): + # Parking kiosk receipts (e.g. MIA) use "net fee: 150.00 USD" format. + # _TOTAL_RE must include "net fee" so Pass 1 catches it and avoids + # the max-scan accidentally picking up a larger line like entry/exit fees. + text = ( + 'MIAMI AIRPORT PARKING\n' + 'Entry 05/09 08:00\n' + 'Exit 05/10 14:30\n' + 'net fee: 150.00 USD' + ) + assert _extract_amount_from_text(text) == 150.00 + class TestBankStatementDetection: def _stmt(self, n: int) -> str: @@ -657,6 +669,72 @@ async def test_parse_ocr_failed_skips_llm_amount(): assert result['date'] == '2026-05-10' +@pytest.mark.asyncio +async def test_vendor_prompt_does_not_contain_mcdonalds(): + """The vendor LLM prompt must not reference 'McDonald' as a correction + example — it biases the model toward returning McDonald's whenever OCR + text is unclear, causing unrelated receipts (Home Depot, HMSHost) to be + misidentified as McDonald's. 
+ """ + agent = _make_agent() + captured: list[str] = [] + + llm_resp = MagicMock() + llm_resp.content = '{"vendor":"The Home Depot","product_name":"Supplies"}' + + async def _capture(messages, caller=None): + for m in messages: + captured.append(m.get('content', '')) + return llm_resp + + agent._llm.submit = _capture + + await agent._parse_receipt_text( + 'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78', + 'homedepot.jpg', + expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}], + ) + + full_prompt = ' '.join(captured) + assert 'McDonald' not in full_prompt, ( + "Vendor prompt must not contain 'McDonald' — it biases the model toward " + "returning McDonald's for any ambiguous receipt." + ) + + +@pytest.mark.asyncio +async def test_vendor_prompt_instructs_not_to_guess_absent_brand(): + """Prompt must explicitly tell the LLM not to substitute a brand name that + isn't in the OCR text — prevents "default to well-known fast food" behaviour. + """ + agent = _make_agent() + captured: list[str] = [] + + llm_resp = MagicMock() + llm_resp.content = '{"vendor":"SERGIO\'S MIAMI AIRPORT","product_name":"Meals"}' + + async def _capture(messages, caller=None): + for m in messages: + captured.append(m.get('content', '')) + return llm_resp + + agent._llm.submit = _capture + + await agent._parse_receipt_text( + '(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29', + 'sergios.jpg', + expense_products=[{'id': 1, 'name': 'Meals'}], + ) + + full_prompt = ' '.join(captured) + # The prompt should warn the model not to invent brand names + assert 'only use a brand name' in full_prompt.lower() or \ + 'do not' in full_prompt.lower() or \ + 'not substitute' in full_prompt.lower(), ( + "Prompt must instruct the LLM not to substitute a different brand name." 
+ ) + + # --------------------------------------------------------------------------- # parse_upload — receipt_parser.py # ---------------------------------------------------------------------------