Fix vendor mis-identification (McDonald's bias), MIA Parking amount, grayscale OCR fallback
- Remove "NeDonald's → McDonald's" from LLM vendor correction examples; the
example was biasing the model to return McDonald's for any ambiguous receipt
(Home Depot, Sergio's/HMSHost). Replace with neutral brand examples and add
an explicit instruction not to substitute a brand name absent from the OCR text.
- Add `net\s*fee` to _TOTAL_RE so MIA Parking kiosk receipts ("net fee: 150.00 USD")
are captured by Pass 1 rather than by the max-scan, which could pick a larger amount from another line.
- Add Step 5b grayscale fallback in receipt_parser: if all binarized PSM attempts
yield < 20 chars, retry OCR on the pre-binarization grayscale image. Fixes OCR of
dot-matrix and certain thermal-print fonts that binarization at threshold 160 wipes out.
- Tests: 88 passing, including three new tests (test_net_fee_parking,
test_vendor_prompt_does_not_contain_mcdonalds,
test_vendor_prompt_instructs_not_to_guess_absent_brand).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -23,7 +23,7 @@ from ..tools.expenses_tools import ExpensesTools
|
|||||||
_TOTAL_RE = re.compile(
|
_TOTAL_RE = re.compile(
|
||||||
r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|'
|
r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|'
|
||||||
r'total\s*amount|total\s*charged|total\s*sale|net\s*sale|'
|
r'total\s*amount|total\s*charged|total\s*sale|net\s*sale|'
|
||||||
r'sale\s*total|you\s*paid|amount\s*paid|total)'
|
r'sale\s*total|you\s*paid|amount\s*paid|net\s*fee|total)'
|
||||||
r'(?!\s*tax)' # exclude "Total Tax / Total Taxes"
|
r'(?!\s*tax)' # exclude "Total Tax / Total Taxes"
|
||||||
r'\s*[:\-]?\s*\$?\s*([\d,]+\.\d{2})',
|
r'\s*[:\-]?\s*\$?\s*([\d,]+\.\d{2})',
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
@@ -551,19 +551,25 @@ class ExpensesAgent(BaseAgent):
|
|||||||
excerpt = stripped[:600]
|
excerpt = stripped[:600]
|
||||||
prompt = (
|
prompt = (
|
||||||
'Return ONLY valid JSON with exactly two keys:\n'
|
'Return ONLY valid JSON with exactly two keys:\n'
|
||||||
'"vendor": the merchant or store name from the receipt header. '
|
'"vendor": the business name printed at the TOP of the receipt '
|
||||||
'OCR often garbles text — use your knowledge to correct obvious '
|
'(usually the first 1-3 lines). '
|
||||||
'errors (e.g. "NeDonald\'s" → "McDonald\'s", "TN-N-QUT" → '
|
'Ignore slogans ("How doers get more done"), product item names, '
|
||||||
'"IN-N-OUT Burger", "Subwey" → "Subway", "LRYAL" → "LAYAL"). '
|
'and payment-processor logos. '
|
||||||
|
'OCR often substitutes look-alike characters — correct obvious '
|
||||||
|
'errors (e.g. "LRYAL" → "LAYAL", "Subwey" → "Subway", '
|
||||||
|
'"H0ME DEP0T" → "HOME DEPOT", "W4LMART" → "WALMART"). '
|
||||||
|
'IMPORTANT: only use a brand name that is clearly present in the '
|
||||||
|
'text — do NOT substitute a different well-known brand if the '
|
||||||
|
'name is merely unclear. '
|
||||||
'If this looks like a bank or credit-card statement listing '
|
'If this looks like a bank or credit-card statement listing '
|
||||||
'multiple transactions rather than a single merchant receipt, '
|
'multiple transactions rather than a single merchant receipt, '
|
||||||
'use "". Use "" if no clear business name is visible.\n'
|
'use "". Use "" if no clear business name is visible.\n'
|
||||||
f'"product_name": pick the single best match from [{product_list}]. '
|
f'"product_name": pick the single best match from [{product_list}]. '
|
||||||
'Guide: restaurant / cafe / fast food → food/meal product; '
|
'Guide: restaurant / cafe / fast food / food court → food/meal product; '
|
||||||
'airline / airport / transit / taxi / parking / rental car → travel product; '
|
'airline / airport / transit / taxi / parking / rental car → travel product; '
|
||||||
'gas station / petrol / fuel → fuel product; '
|
'gas station / petrol / fuel → fuel product; '
|
||||||
'hotel / motel / lodging → accommodation product; '
|
'hotel / motel / lodging → accommodation product; '
|
||||||
'office / tech / hardware store → supplies product. '
|
'hardware / home improvement / tech / office supply store → supplies product. '
|
||||||
'Return "" if nothing fits.\n\n'
|
'Return "" if nothing fits.\n\n'
|
||||||
f'Receipt text:\n{excerpt}\n\nJSON only:'
|
f'Receipt text:\n{excerpt}\n\nJSON only:'
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -130,6 +130,7 @@ def _ocr_image_tesseract(data: bytes, filename: str) -> str:
|
|||||||
# ── Step 3: Grayscale + contrast ─────────────────────────────────────
|
# ── Step 3: Grayscale + contrast ─────────────────────────────────────
|
||||||
img = ImageOps.grayscale(img)
|
img = ImageOps.grayscale(img)
|
||||||
img = ImageOps.autocontrast(img)
|
img = ImageOps.autocontrast(img)
|
||||||
|
img_gray = img # save grayscale for fallback — before binarization
|
||||||
|
|
||||||
# ── Step 4: Sharpen then binarize ─────────────────────────────────────
|
# ── Step 4: Sharpen then binarize ─────────────────────────────────────
|
||||||
# Sharpen first so edges are crisp before thresholding.
|
# Sharpen first so edges are crisp before thresholding.
|
||||||
@@ -152,6 +153,23 @@ def _ocr_image_tesseract(data: bytes, filename: str) -> str:
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# ── Step 5b: Grayscale fallback ───────────────────────────────────────
|
||||||
|
# Binarization at threshold 160 can destroy dot-matrix and certain
|
||||||
|
# thermal-print fonts (e.g. parking kiosk receipts) where character
|
||||||
|
# pixels are close to the threshold and get wiped to white. If every
|
||||||
|
# binarized attempt failed, retry on the plain grayscale image —
|
||||||
|
# Tesseract handles grey-level input reasonably well for these cases.
|
||||||
|
for psm in (6, 4, 11):
|
||||||
|
try:
|
||||||
|
text = pytesseract.image_to_string(
|
||||||
|
img_gray, config=f'--oem 3 --psm {psm}').strip()
|
||||||
|
if len(text) >= 20:
|
||||||
|
logger.debug('Tesseract grayscale fallback %s: psm=%d %d chars',
|
||||||
|
filename, psm, len(text))
|
||||||
|
return text
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
logger.warning('Tesseract OCR %s: all PSM modes returned < 20 chars', filename)
|
logger.warning('Tesseract OCR %s: all PSM modes returned < 20 chars', filename)
|
||||||
return ''
|
return ''
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
|||||||
@@ -524,6 +524,18 @@ class TestExtractAmount:
|
|||||||
text = 'Items 8.49\nCash Tendered 20.00\nChange 11.51'
|
text = 'Items 8.49\nCash Tendered 20.00\nChange 11.51'
|
||||||
assert _extract_amount_from_text(text) == 8.49
|
assert _extract_amount_from_text(text) == 8.49
|
||||||
|
|
||||||
|
def test_net_fee_parking(self):
|
||||||
|
# Parking kiosk receipts (e.g. MIA) use "net fee: 150.00 USD" format.
|
||||||
|
# _TOTAL_RE must include "net fee" so Pass 1 catches it and avoids
|
||||||
|
# the max-scan accidentally picking up a larger line like entry/exit fees.
|
||||||
|
text = (
|
||||||
|
'MIAMI AIRPORT PARKING\n'
|
||||||
|
'Entry 05/09 08:00\n'
|
||||||
|
'Exit 05/10 14:30\n'
|
||||||
|
'net fee: 150.00 USD'
|
||||||
|
)
|
||||||
|
assert _extract_amount_from_text(text) == 150.00
|
||||||
|
|
||||||
|
|
||||||
class TestBankStatementDetection:
|
class TestBankStatementDetection:
|
||||||
def _stmt(self, n: int) -> str:
|
def _stmt(self, n: int) -> str:
|
||||||
@@ -657,6 +669,72 @@ async def test_parse_ocr_failed_skips_llm_amount():
|
|||||||
assert result['date'] == '2026-05-10'
|
assert result['date'] == '2026-05-10'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_vendor_prompt_does_not_contain_mcdonalds():
|
||||||
|
"""The vendor LLM prompt must not reference 'McDonald' as a correction
|
||||||
|
example — it biases the model toward returning McDonald's whenever OCR
|
||||||
|
text is unclear, causing unrelated receipts (Home Depot, HMSHost) to be
|
||||||
|
misidentified as McDonald's.
|
||||||
|
"""
|
||||||
|
agent = _make_agent()
|
||||||
|
captured: list[str] = []
|
||||||
|
|
||||||
|
llm_resp = MagicMock()
|
||||||
|
llm_resp.content = '{"vendor":"The Home Depot","product_name":"Supplies"}'
|
||||||
|
|
||||||
|
async def _capture(messages, caller=None):
|
||||||
|
for m in messages:
|
||||||
|
captured.append(m.get('content', ''))
|
||||||
|
return llm_resp
|
||||||
|
|
||||||
|
agent._llm.submit = _capture
|
||||||
|
|
||||||
|
await agent._parse_receipt_text(
|
||||||
|
'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
|
||||||
|
'homedepot.jpg',
|
||||||
|
expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
|
||||||
|
)
|
||||||
|
|
||||||
|
full_prompt = ' '.join(captured)
|
||||||
|
assert 'McDonald' not in full_prompt, (
|
||||||
|
"Vendor prompt must not contain 'McDonald' — it biases the model toward "
|
||||||
|
"returning McDonald's for any ambiguous receipt."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
|
||||||
|
"""Prompt must explicitly tell the LLM not to substitute a brand name that
|
||||||
|
isn't in the OCR text — prevents "default to well-known fast food" behaviour.
|
||||||
|
"""
|
||||||
|
agent = _make_agent()
|
||||||
|
captured: list[str] = []
|
||||||
|
|
||||||
|
llm_resp = MagicMock()
|
||||||
|
llm_resp.content = '{"vendor":"SERGIO\'S MIAMI AIRPORT","product_name":"Meals"}'
|
||||||
|
|
||||||
|
async def _capture(messages, caller=None):
|
||||||
|
for m in messages:
|
||||||
|
captured.append(m.get('content', ''))
|
||||||
|
return llm_resp
|
||||||
|
|
||||||
|
agent._llm.submit = _capture
|
||||||
|
|
||||||
|
await agent._parse_receipt_text(
|
||||||
|
'(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
|
||||||
|
'sergios.jpg',
|
||||||
|
expense_products=[{'id': 1, 'name': 'Meals'}],
|
||||||
|
)
|
||||||
|
|
||||||
|
full_prompt = ' '.join(captured)
|
||||||
|
# The prompt should warn the model not to invent brand names
|
||||||
|
assert 'only use a brand name' in full_prompt.lower() or \
|
||||||
|
'do not' in full_prompt.lower() or \
|
||||||
|
'not substitute' in full_prompt.lower(), (
|
||||||
|
"Prompt must instruct the LLM not to substitute a different brand name."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# parse_upload — receipt_parser.py
|
# parse_upload — receipt_parser.py
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|||||||
Reference in New Issue
Block a user