fix(expenses): LAYAL CAFE $2.80 bug, United Airlines rotation & date

LAYAL CAFE ($2.80 instead of $42.90):
- Add (?!\s*tax) lookahead to _TOTAL_RE so a "Total Tax" / "Total Taxes"
  label is never treated as the receipt-total label
- Change Pass 1 from matches[-1] to max() so the largest labeled amount
  always wins, regardless of line order in the OCR output; this also covers
  the case where OCR drops the "Taxes" word and "Total Taxes $2.80" is read
  as "Total $2.80"

United Airlines (Subway/$0/wrong date):
- Add OSD-based rotation correction in receipt_parser.py: after EXIF
  transpose, ask Tesseract's orientation-detection engine (--psm 0) what
  angle to rotate; applies to receipts photographed lying sideways where
  EXIF metadata cannot help
- Add month-name date patterns (DD MON YYYY / MON DD YYYY) to
  _extract_date_from_text for airline/hotel receipts that print dates
  like "05 MAY 2026" instead of "05/07/26"

85 tests, all passing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-21 00:46:08 -04:00
parent ce57d19528
commit ece811cccb
3 changed files with 90 additions and 8 deletions

View File

@@ -15,10 +15,16 @@ from ..tools.expenses_tools import ExpensesTools
# Matches an explicitly labeled total line.
# Handles "Total: $22.46", "GRAND TOTAL 22.46", "Amount Due: 22.46",
# "Total Sale $58.75" (gas stations), "Net Sale $X", etc.
#
# The negative lookahead (?!\s*tax) prevents "Total Tax" / "Total Taxes"
# (a sub-total line present on restaurant receipts) from being confused
# with the final total when Tesseract splits a two-column label+amount
# layout across lines.
_TOTAL_RE = re.compile(
r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|'
r'total\s*amount|total\s*charged|total\s*sale|net\s*sale|'
r'sale\s*total|you\s*paid|amount\s*paid|total)'
r'(?!\s*tax)' # exclude "Total Tax / Total Taxes"
r'\s*[:\-]?\s*\$?\s*([\d,]+\.\d{2})',
re.IGNORECASE,
)
@@ -52,6 +58,18 @@ def _is_likely_bank_statement(text: str) -> bool:
# Numeric date patterns.  Each one captures three digit groups and is
# anchored with \b on both sides so it does not match inside a longer
# run of digits.  The separator after each group is matched independently.
# Four-digit year first, then two-digit month and two-digit day.
_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD
# One- or two-digit month and day, then a four-digit year.
_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY
# Same layout as above but with a two-digit year.
_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY
# "05 MAY 2026" or "MAY 05 2026" or "05 May, 2026" (airline / hotel receipts)
_DATE_MON_RE = re.compile(
r'\b(\d{1,2})\s+([A-Za-z]{3,9})[,\s]+(\d{4})\b' # DD MON YYYY
r'|\b([A-Za-z]{3,9})\s+(\d{1,2})[,\s]+(\d{4})\b', # MON DD YYYY
)
_MONTH_MAP: dict[str, int] = {
'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
'january': 1, 'february': 2, 'march': 3, 'april': 4,
'june': 6, 'july': 7, 'august': 8, 'september': 9,
'october': 10, 'november': 11, 'december': 12,
}
def _extract_amount_from_text(text: str) -> float:
@@ -71,16 +89,22 @@ def _extract_amount_from_text(text: str) -> float:
if not text:
return 0.0
# Pass 1: explicit label match
# Pass 1: explicit label match — return the LARGEST labeled amount.
# Using max() rather than the last positional match handles the common
# OCR artefact where "Total\n$2.80" (garbled "Total Taxes") can land on
# either side of "Total\n$42.90" in the text; the actual total wins on
# value, whatever the line order.
matches = list(_TOTAL_RE.finditer(text))
if matches:
raw = matches[-1].group(1).replace(',', '')
try:
val = float(raw)
if val > 0:
return val
except ValueError:
pass
best_labeled = 0.0
for m in matches:
try:
val = float(m.group(1).replace(',', ''))
if val > best_labeled:
best_labeled = val
except ValueError:
pass
if best_labeled > 0:
return best_labeled
# Pass 2: maximum dollar amount across the full text
best = 0.0
@@ -121,6 +145,19 @@ def _extract_date_from_text(text: str) -> str | None:
if 1 <= mo <= 12 and 1 <= d <= 31:
y = 2000 + yr if yr < 50 else 1900 + yr
return f'{y}-{mo:02d}-{d:02d}'
# Month-name formats: "05 MAY 2026", "MAY 05 2026", "05 May, 2026"
# Common on airline, hotel, and formal business receipts.
m = _DATE_MON_RE.search(text)
if m:
if m.group(1): # DD MON YYYY branch
d_s, mon_s, y_s = m.group(1), m.group(2), m.group(3)
else: # MON DD YYYY branch
mon_s, d_s, y_s = m.group(4), m.group(5), m.group(6)
mo = _MONTH_MAP.get(mon_s.lower()[:3])
if mo:
d_i, y_i = int(d_s), int(y_s)
if 1 <= d_i <= 31 and 2000 <= y_i <= 2099:
return f'{y_i}-{mo:02d}-{d_i:02d}'
return None
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

View File

@@ -100,6 +100,23 @@ def _ocr_image_tesseract(data: bytes, filename: str) -> str:
except Exception:
pass # exif_transpose requires Pillow >= 6.0
# ── Step 1b: Content-based rotation correction ───────────────────────
# EXIF transpose (Step 1) only corrects for phone-tilt metadata.
# If the receipt was physically laid sideways in the frame (e.g. a
# landscape receipt photographed with the phone upright), the pixels
# are genuinely rotated and EXIF can't help. Ask Tesseract's OSD
# engine to detect the text orientation and rotate to correct it.
try:
osd = pytesseract.image_to_osd(img, config='--psm 0')
_am = re.search(r'Rotate:\s*(\d+)', osd)
if _am:
_angle = int(_am.group(1))
if _angle:
img = img.rotate(_angle, expand=True)
logger.debug('OSD: rotated %s by %d°', filename, _angle)
except Exception:
pass # OSD unavailable or not enough text — proceed without correction
# ── Step 2: Resize to working width (1800px) ──────────────────────────
max_w = 1800
if img.width > max_w:

View File

@@ -428,6 +428,7 @@ async def test_act_no_employee_returns_empty_and_escalates():
from agent_service.agents.expenses_agent import (
_extract_amount_from_text, _extract_date_from_text, _is_likely_bank_statement,
_MONTH_MAP,
)
@@ -473,6 +474,19 @@ class TestExtractAmount:
text = 'Items 5.00\nTax 0.50\nTotal\n5.50'
assert _extract_amount_from_text(text) == 5.50
def test_total_taxes_excluded(self):
    """A "Total Taxes" line must not be reported as the receipt total."""
    # Three labeled lines; only the last one carries the real total.
    receipt = 'Subtotal $40.10\nTotal Taxes $2.80\nTotal $42.90'
    extracted = _extract_amount_from_text(receipt)
    assert extracted == 42.90
def test_pass1_returns_max_not_last(self):
    """The largest labeled amount wins, not the last one in the text."""
    # Two labeled totals with the smaller one second: picking the last
    # match would give 2.80, picking the maximum gives 42.90.
    ocr_text = 'Grand Total $42.90\nTotal $2.80'
    amount = _extract_amount_from_text(ocr_text)
    assert amount == 42.90
def test_total_sale_gas_station(self):
# Costco / Shell gas receipts say "Total Sale $X.XX", not "Total: $X.XX"
text = 'Pump 9 16.189 Gal\nRegular $ 58.75\nTotal Sale $ 58.75'
@@ -566,6 +580,20 @@ class TestExtractDate:
def test_us_short_year(self):
    """A two-digit year in M/D/YY form is expanded to a full ISO date."""
    parsed = _extract_date_from_text('05/09/26')
    assert parsed == '2026-05-09'
def test_dd_mon_yyyy(self):
    """Day-first month-name date surrounded by other receipt text."""
    # Airline / hotel style: "05 MAY 2026", "Issue Date: 05 May 2026"
    parsed = _extract_date_from_text('Issue Date: 05 MAY 2026 MIA A70')
    assert parsed == '2026-05-05'
def test_mon_dd_yyyy(self):
    """Month-first month-name date without a comma."""
    parsed = _extract_date_from_text('MAY 05 2026')
    assert parsed == '2026-05-05'
def test_mon_dd_comma_yyyy(self):
    """Month-first date with a single-digit day and a comma before the year."""
    parsed = _extract_date_from_text('May 5, 2026')
    assert parsed == '2026-05-05'
def test_month_map_completeness(self):
    """All twelve three-letter abbreviations are present and correctly numbered."""
    abbreviations = (
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    )
    # Counting three-letter keys alone would also pass with twelve arbitrary
    # keys or with wrong month numbers, so pin the exact keys and values.
    assert {k for k in _MONTH_MAP if len(k) == 3} == set(abbreviations)
    assert [_MONTH_MAP[abbr] for abbr in abbreviations] == list(range(1, 13))
def test_no_date(self):
    """Text without any recognisable date yields None."""
    parsed = _extract_date_from_text('No date here')
    assert parsed is None