Fix vendor mis-identification (McDonald's bias), MIA Parking amount, grayscale OCR fallback
- Remove "NeDonald's → McDonald's" from LLM vendor correction examples; the
example was biasing the model to return McDonald's for any ambiguous receipt
(Home Depot, Sergio's/HMSHost). Replace with neutral brand examples and add
an explicit instruction not to substitute a brand name absent from the OCR text.
- Add `net\s*fee` to _TOTAL_RE so MIA Parking kiosk receipts ("net fee: 150.00 USD")
are captured by Pass 1 rather than the max-scan which could pick a larger line.
- Add Step 5b grayscale fallback in receipt_parser: if all binarized PSM attempts
yield < 20 chars, retry OCR on the pre-binarization grayscale image. Fixes
dot-matrix and certain thermal-print fonts destroyed by the 160-threshold.
- Tests: 88 passing (test_net_fee_parking, test_vendor_prompt_does_not_contain_mcdonalds,
test_vendor_prompt_instructs_not_to_guess_absent_brand).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -524,6 +524,18 @@ class TestExtractAmount:
|
||||
text = 'Items 8.49\nCash Tendered 20.00\nChange 11.51'
|
||||
assert _extract_amount_from_text(text) == 8.49
|
||||
|
||||
def test_net_fee_parking(self):
|
||||
# Parking kiosk receipts (e.g. MIA) use "net fee: 150.00 USD" format.
|
||||
# _TOTAL_RE must include "net fee" so Pass 1 catches it and avoids
|
||||
# the max-scan accidentally picking up a larger line like entry/exit fees.
|
||||
text = (
|
||||
'MIAMI AIRPORT PARKING\n'
|
||||
'Entry 05/09 08:00\n'
|
||||
'Exit 05/10 14:30\n'
|
||||
'net fee: 150.00 USD'
|
||||
)
|
||||
assert _extract_amount_from_text(text) == 150.00
|
||||
|
||||
|
||||
class TestBankStatementDetection:
|
||||
def _stmt(self, n: int) -> str:
|
||||
@@ -657,6 +669,72 @@ async def test_parse_ocr_failed_skips_llm_amount():
|
||||
assert result['date'] == '2026-05-10'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_vendor_prompt_does_not_contain_mcdonalds():
|
||||
"""The vendor LLM prompt must not reference 'McDonald' as a correction
|
||||
example — it biases the model toward returning McDonald's whenever OCR
|
||||
text is unclear, causing unrelated receipts (Home Depot, HMSHost) to be
|
||||
misidentified as McDonald's.
|
||||
"""
|
||||
agent = _make_agent()
|
||||
captured: list[str] = []
|
||||
|
||||
llm_resp = MagicMock()
|
||||
llm_resp.content = '{"vendor":"The Home Depot","product_name":"Supplies"}'
|
||||
|
||||
async def _capture(messages, caller=None):
|
||||
for m in messages:
|
||||
captured.append(m.get('content', ''))
|
||||
return llm_resp
|
||||
|
||||
agent._llm.submit = _capture
|
||||
|
||||
await agent._parse_receipt_text(
|
||||
'THE HOME DEPOT\nHow doers get more done\nWAGNER FURNO 300HG 36.78\nVISA USD$ 36.78',
|
||||
'homedepot.jpg',
|
||||
expense_products=[{'id': 1, 'name': 'Meals'}, {'id': 2, 'name': 'Supplies'}],
|
||||
)
|
||||
|
||||
full_prompt = ' '.join(captured)
|
||||
assert 'McDonald' not in full_prompt, (
|
||||
"Vendor prompt must not contain 'McDonald' — it biases the model toward "
|
||||
"returning McDonald's for any ambiguous receipt."
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_vendor_prompt_instructs_not_to_guess_absent_brand():
|
||||
"""Prompt must explicitly tell the LLM not to substitute a brand name that
|
||||
isn't in the OCR text — prevents "default to well-known fast food" behaviour.
|
||||
"""
|
||||
agent = _make_agent()
|
||||
captured: list[str] = []
|
||||
|
||||
llm_resp = MagicMock()
|
||||
llm_resp.content = '{"vendor":"SERGIO\'S MIAMI AIRPORT","product_name":"Meals"}'
|
||||
|
||||
async def _capture(messages, caller=None):
|
||||
for m in messages:
|
||||
captured.append(m.get('content', ''))
|
||||
return llm_resp
|
||||
|
||||
agent._llm.submit = _capture
|
||||
|
||||
await agent._parse_receipt_text(
|
||||
'(((HMSHost ByAvolta\nSERGIO\'S MIAMI AIRPORT\nCHK 9745\nPayment $16.29',
|
||||
'sergios.jpg',
|
||||
expense_products=[{'id': 1, 'name': 'Meals'}],
|
||||
)
|
||||
|
||||
full_prompt = ' '.join(captured)
|
||||
# The prompt should warn the model not to invent brand names
|
||||
assert 'only use a brand name' in full_prompt.lower() or \
|
||||
'do not' in full_prompt.lower() or \
|
||||
'not substitute' in full_prompt.lower(), (
|
||||
"Prompt must instruct the LLM not to substitute a different brand name."
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# parse_upload — receipt_parser.py
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user