fix(expenses): LAYAL CAFE $2.80 bug, United Airlines rotation & date
LAYAL CAFE ($2.80 instead of $42.90):
- Add (?!\s*tax) lookahead to _TOTAL_RE so "Total Taxes $2.80" is never confused with the receipt total when OCR drops the "Taxes" word
- Change Pass 1 from matches[-1] to max() so the largest labeled amount always wins, regardless of line order in the OCR output

United Airlines (Subway/$0/wrong date):
- Add OSD-based rotation correction in receipt_parser.py: after EXIF transpose, ask Tesseract's orientation-detection engine (--psm 0) what angle to rotate; applies to receipts photographed lying sideways where EXIF metadata cannot help
- Add month-name date patterns (DD MON YYYY / MON DD YYYY) to _extract_date_from_text for airline/hotel receipts that print dates like "05 MAY 2026" instead of "05/07/26"

85 tests, all passing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -15,10 +15,16 @@ from ..tools.expenses_tools import ExpensesTools
|
||||
# Matches an explicitly labeled total line.
|
||||
# Handles "Total: $22.46", "GRAND TOTAL 22.46", "Amount Due: 22.46",
|
||||
# "Total Sale $58.75" (gas stations), "Net Sale $X", etc.
|
||||
#
|
||||
# The negative lookahead (?!\s*tax) prevents "Total Tax" / "Total Taxes"
|
||||
# (a sub-total line present on restaurant receipts) from being confused
|
||||
# with the final total when Tesseract splits a two-column label+amount
|
||||
# layout across lines.
|
||||
_TOTAL_RE = re.compile(
|
||||
r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|'
|
||||
r'total\s*amount|total\s*charged|total\s*sale|net\s*sale|'
|
||||
r'sale\s*total|you\s*paid|amount\s*paid|total)'
|
||||
r'(?!\s*tax)' # exclude "Total Tax / Total Taxes"
|
||||
r'\s*[:\-]?\s*\$?\s*([\d,]+\.\d{2})',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
@@ -52,6 +58,18 @@ def _is_likely_bank_statement(text: str) -> bool:
|
||||
_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD
|
||||
_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY
|
||||
_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY
|
||||
# "05 MAY 2026" or "MAY 05 2026" or "05 May, 2026" (airline / hotel receipts)
|
||||
_DATE_MON_RE = re.compile(
|
||||
r'\b(\d{1,2})\s+([A-Za-z]{3,9})[,\s]+(\d{4})\b' # DD MON YYYY
|
||||
r'|\b([A-Za-z]{3,9})\s+(\d{1,2})[,\s]+(\d{4})\b', # MON DD YYYY
|
||||
)
|
||||
_MONTH_MAP: dict[str, int] = {
|
||||
'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
|
||||
'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
|
||||
'january': 1, 'february': 2, 'march': 3, 'april': 4,
|
||||
'june': 6, 'july': 7, 'august': 8, 'september': 9,
|
||||
'october': 10, 'november': 11, 'december': 12,
|
||||
}
|
||||
|
||||
|
||||
def _extract_amount_from_text(text: str) -> float:
|
||||
@@ -71,16 +89,22 @@ def _extract_amount_from_text(text: str) -> float:
|
||||
if not text:
|
||||
return 0.0
|
||||
|
||||
# Pass 1: explicit label match
|
||||
# Pass 1: explicit label match — return the LARGEST labeled amount.
|
||||
# Using max() rather than the last positional match handles the common
|
||||
# OCR artefact where "Total\n$2.80" (garbled "Total Taxes") appears
|
||||
# before "Total\n$42.90" in the text; the actual total wins on value.
|
||||
matches = list(_TOTAL_RE.finditer(text))
|
||||
if matches:
|
||||
raw = matches[-1].group(1).replace(',', '')
|
||||
try:
|
||||
val = float(raw)
|
||||
if val > 0:
|
||||
return val
|
||||
except ValueError:
|
||||
pass
|
||||
best_labeled = 0.0
|
||||
for m in matches:
|
||||
try:
|
||||
val = float(m.group(1).replace(',', ''))
|
||||
if val > best_labeled:
|
||||
best_labeled = val
|
||||
except ValueError:
|
||||
pass
|
||||
if best_labeled > 0:
|
||||
return best_labeled
|
||||
|
||||
# Pass 2: maximum dollar amount across the full text
|
||||
best = 0.0
|
||||
@@ -121,6 +145,19 @@ def _extract_date_from_text(text: str) -> str | None:
|
||||
if 1 <= mo <= 12 and 1 <= d <= 31:
|
||||
y = 2000 + yr if yr < 50 else 1900 + yr
|
||||
return f'{y}-{mo:02d}-{d:02d}'
|
||||
# Month-name formats: "05 MAY 2026", "MAY 05 2026", "05 May, 2026"
|
||||
# Common on airline, hotel, and formal business receipts.
|
||||
m = _DATE_MON_RE.search(text)
|
||||
if m:
|
||||
if m.group(1): # DD MON YYYY branch
|
||||
d_s, mon_s, y_s = m.group(1), m.group(2), m.group(3)
|
||||
else: # MON DD YYYY branch
|
||||
mon_s, d_s, y_s = m.group(4), m.group(5), m.group(6)
|
||||
mo = _MONTH_MAP.get(mon_s.lower()[:3])
|
||||
if mo:
|
||||
d_i, y_i = int(d_s), int(y_s)
|
||||
if 1 <= d_i <= 31 and 2000 <= y_i <= 2099:
|
||||
return f'{y_i}-{mo:02d}-{d_i:02d}'
|
||||
return None
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -100,6 +100,23 @@ def _ocr_image_tesseract(data: bytes, filename: str) -> str:
|
||||
except Exception:
|
||||
pass # exif_transpose requires Pillow >= 6.0
|
||||
|
||||
# ── Step 1b: Content-based rotation correction ───────────────────────
|
||||
# EXIF transpose (Step 1) only corrects for phone-tilt metadata.
|
||||
# If the receipt was physically laid sideways in the frame (e.g. a
|
||||
# landscape receipt photographed with the phone upright), the pixels
|
||||
# are genuinely rotated and EXIF can't help. Ask Tesseract's OSD
|
||||
# engine to detect the text orientation and rotate to correct it.
|
||||
try:
|
||||
osd = pytesseract.image_to_osd(img, config='--psm 0')
|
||||
_am = re.search(r'Rotate:\s*(\d+)', osd)
|
||||
if _am:
|
||||
_angle = int(_am.group(1))
|
||||
if _angle:
|
||||
img = img.rotate(_angle, expand=True)
|
||||
logger.debug('OSD: rotated %s by %d°', filename, _angle)
|
||||
except Exception:
|
||||
pass # OSD unavailable or not enough text — proceed without correction
|
||||
|
||||
# ── Step 2: Resize to working width (1800px) ──────────────────────────
|
||||
max_w = 1800
|
||||
if img.width > max_w:
|
||||
|
||||
@@ -428,6 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates():
|
||||
|
||||
from agent_service.agents.expenses_agent import (
|
||||
_extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
|
||||
_MONTH_MAP,
|
||||
)
|
||||
|
||||
|
||||
@@ -473,6 +474,19 @@ class TestExtractAmount:
|
||||
text = 'Items 5.00\nTax 0.50\nTotal\n5.50'
|
||||
assert _extract_amount_from_text(text) == 5.50
|
||||
|
||||
def test_total_taxes_excluded(self):
    # A "Total Taxes" line sits between the subtotal and the real total.
    # The labeled-total regex skips 'total tax/taxes' via its lookahead,
    # so the extractor must report the final total, not the tax amount.
    ocr_text = 'Subtotal $40.10\nTotal Taxes $2.80\nTotal $42.90'
    extracted = _extract_amount_from_text(ocr_text)
    assert extracted == 42.90
|
||||
|
||||
def test_pass1_returns_max_not_last(self):
    # When OCR garbles "Total Taxes" into "Total\n$2.80", _TOTAL_RE ends up
    # matching two labeled amounts. The largest one must be returned, not
    # whichever happens to come last in the text.
    # Simulated here with two labeled totals, the smaller one second.
    ocr_text = 'Grand Total $42.90\nTotal $2.80'
    extracted = _extract_amount_from_text(ocr_text)
    assert extracted == 42.90
|
||||
|
||||
def test_total_sale_gas_station(self):
|
||||
# Costco / Shell gas receipts say "Total Sale $X.XX", not "Total: $X.XX"
|
||||
text = 'Pump 9 16.189 Gal\nRegular $ 58.75\nTotal Sale $ 58.75'
|
||||
@@ -566,6 +580,20 @@ class TestExtractDate:
|
||||
def test_us_short_year(self):
    # M/D/YY input with a two-digit year.
    parsed = _extract_date_from_text('05/09/26')
    assert parsed == '2026-05-09'
|
||||
|
||||
def test_dd_mon_yyyy(self):
    # Day-first month-name date as printed on airline / hotel receipts,
    # embedded in surrounding text.
    parsed = _extract_date_from_text('Issue Date: 05 MAY 2026 MIA A70')
    assert parsed == '2026-05-05'
|
||||
|
||||
def test_mon_dd_yyyy(self):
    # Month-first variant with no comma before the year.
    parsed = _extract_date_from_text('MAY 05 2026')
    assert parsed == '2026-05-05'
|
||||
|
||||
def test_mon_dd_comma_yyyy(self):
    # Month-first variant with a single-digit day and a comma before the year.
    parsed = _extract_date_from_text('May 5, 2026')
    assert parsed == '2026-05-05'
|
||||
|
||||
def test_month_map_completeness(self):
    # All twelve three-letter abbreviations must be present AND map to the
    # right month number. Counting keys alone would also pass for twelve
    # arbitrary three-letter keys or for wrong values.
    abbreviations = (
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    )
    for month_number, abbr in enumerate(abbreviations, start=1):
        assert _MONTH_MAP[abbr] == month_number
    # No stray three-letter keys beyond the twelve abbreviations.
    assert {k for k in _MONTH_MAP if len(k) == 3} == set(abbreviations)
|
||||
|
||||
def test_no_date(self):
    # Text without any recognisable date yields None.
    parsed = _extract_date_from_text('No date here')
    assert parsed is None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user