diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py index 00a2367..f6be67e 100644 --- a/agent_service/agents/expenses_agent.py +++ b/agent_service/agents/expenses_agent.py @@ -3,10 +3,65 @@ import asyncio import difflib import json import logging +import re from datetime import date as _date from .base_agent import BaseAgent, AgentReport, AgentDirective, SweepReport from ..tools.expenses_tools import ExpensesTools +# --------------------------------------------------------------------------- +# Receipt OCR helpers — regex-based, deterministic extraction +# --------------------------------------------------------------------------- + +# Matches the final-total line on a receipt. +# Handles "Total: $22.46", "GRAND TOTAL 22.46", "Amount Due: 22.46", etc. +_TOTAL_RE = re.compile( + r'(?:grand\s*total|total\s*due|amount\s*due|balance\s*due|' + r'total\s*amount|total\s*charged|you\s*paid|amount\s*paid|total)' + r'\s*[:\-]?\s*\$?\s*([\d,]+\.\d{2})', + re.IGNORECASE, +) + +_DATE_ISO_RE = re.compile(r'\b(\d{4})[-/](\d{2})[-/](\d{2})\b') # YYYY-MM-DD or YYYY/MM/DD +_DATE_US_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b') # M/D/YYYY +_DATE_US_SHORT_RE = re.compile(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b') # M/D/YY + + +def _extract_amount_from_text(text: str) -> float: + """Return the final total from OCR receipt text, or 0.0 if not found.""" + if not text: + return 0.0 + matches = list(_TOTAL_RE.finditer(text)) + if matches: + raw = matches[-1].group(1).replace(',', '') # last match = grand total + try: + return float(raw) + except ValueError: + pass + return 0.0 + + +def _extract_date_from_text(text: str) -> str | None: + """Return the first plausible date in OCR text as YYYY-MM-DD, or None.""" + if not text: + return None + m = _DATE_ISO_RE.search(text) + if m: + y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3)) + if 2000 <= y <= 2099 and 1 <= mo <= 12 and 1 <= d <= 31: + return f'{y}-{mo:02d}-{d:02d}' + m = _DATE_US_RE.search(text) + if m: + mo, d, y = int(m.group(1)), int(m.group(2)), int(m.group(3)) + if 1 <= mo <= 12 and 1 <= d <= 31 and y >= 2000: + return f'{y}-{mo:02d}-{d:02d}' + m = _DATE_US_SHORT_RE.search(text) + if m: + mo, d, yr = int(m.group(1)), int(m.group(2)), int(m.group(3)) + if 1 <= mo <= 12 and 1 <= d <= 31: + y = 2000 + yr if yr < 50 else 1900 + yr + return f'{y}-{mo:02d}-{d:02d}' + return None + logger = logging.getLogger(__name__) EXPENSES_TOOLS = [ @@ -320,70 +375,61 @@ class ExpensesAgent(BaseAgent): async def _parse_receipt_text(self, text: str, filename: str, expense_products: list = None, date_hint: str = None) -> dict: - today = _date.today().isoformat() - fallback = {'vendor': filename, 'amount': 0.0, - 'date': date_hint or today, 'time': None, 'product_name': ''} + """Parse a single receipt into structured fields. + Strategy (most-reliable first): + amount → regex on OCR text (deterministic) + date → filename timestamp > OCR regex > today + vendor → LLM (short excerpt, first ~600 chars) + product_name→ LLM (semantic match against expense product list) + + The LLM is intentionally NOT asked for amount or date — the local + model hallucinates those fields when OCR text is ambiguous. + """ + today = _date.today().isoformat() stripped = (text or '').strip() ocr_failed = not stripped or stripped.startswith('[') - product_list = '' - if expense_products: - names = [p['name'] for p in expense_products] - product_list = ', '.join(f'"{n}"' for n in names) + # ── Amount: regex (deterministic) ──────────────────────────────────── + amount = _extract_amount_from_text(stripped) if not ocr_failed else 0.0 - if ocr_failed: - # No OCR text — still try to classify category from filename/date - if not product_list: - return fallback + # ── Date: filename > OCR regex > today ─────────────────────────────── + if date_hint: + date = date_hint + elif not ocr_failed: + date = _extract_date_from_text(stripped) or today + else: + date = today + + # ── Vendor + Category: LLM (two fields only) ───────────────────────── + vendor = filename + product_name = '' + product_list = ', '.join(f'"{p["name"]}"' for p in (expense_products or [])) + + if not ocr_failed: + # Give LLM only the header of the receipt — vendor is in the first lines + excerpt = stripped[:600] prompt = ( - f'A receipt photo named "{filename}" could not be read by OCR. ' - f'Based only on the filename, pick the most likely expense category ' - f'from this list: [{product_list}]. ' - f'Return ONLY valid JSON: {{"product_name": "..."}}' + 'Return ONLY valid JSON with exactly two keys:\n' + '"vendor": the store or restaurant name, copied exactly from the ' + 'first 1-3 lines of the receipt. Use "" if no clear name.\n' + f'"product_name": the single best match from [{product_list}] ' + 'based on the type of business (restaurant→Meals, gas station→Fuel, ' + 'hotel→Hotel, airline/transit→Transport, office store→Office Supplies). ' + 'Use "" if none fit.\n\n' + f'Receipt:\n{excerpt}\n\nJSON only:' + ) + elif product_list: + # OCR failed — guess category from filename only + prompt = ( + f'A receipt file named "{filename}" could not be read. ' + f'Pick the most likely match from [{product_list}] based on the filename, ' + f'or "". Return ONLY: {{"vendor": "", "product_name": "..."}}' ) else: - # Keep both the header (vendor/date) and footer (totals) of the receipt. - # A plain [:N] cut discards the bottom of long receipts where the grand - # total lives — the primary cause of amount=0 extraction errors. - if len(stripped) > 3000: - receipt_text = stripped[:1500] + '\n[...]\n' + stripped[-1500:] - else: - receipt_text = stripped + return {'vendor': filename, 'amount': amount, 'date': date, + 'time': None, 'product_name': ''} - # When the filename carries a reliable timestamp, inject it directly - # so the LLM doesn't try to read (and potentially misread) the date - # from garbled OCR text. - if date_hint: - date_instruction = ( - f'Use exactly "{date_hint}" — this date was read from the file ' - f'timestamp and is more reliable than the OCR text.' - ) - else: - date_instruction = ( - f'Extract from the receipt text in YYYY-MM-DD format; ' - f'use {today} only if no date is visible.' - ) - - prompt = ( - 'You are a receipt data extractor. ' - 'Copy values EXACTLY as they appear in the text — ' - 'do NOT guess, infer, "correct" OCR errors, or invent plausible values.\n\n' - 'Return ONLY valid JSON with these keys:\n' - f'"vendor": merchant name exactly as printed; ' - f'empty string "" if you cannot find it clearly,\n' - f'"amount": the FINAL total — find a line labeled "Total", "Grand Total", ' - f'"Amount Due", or "Balance Due"; copy the number exactly as written; ' - f'never use subtotal, tax, or tip lines; ' - f'return 0 if no clearly labeled final total is present,\n' - f'"date": {date_instruction}\n' - f'"time": transaction time HH:MM (24-hour) exactly as printed, or null,\n' - f'"product_name": best match from [{product_list}] or "".\n\n' - f'IMPORTANT: This text came from OCR and may contain garbled characters. ' - f'If a value looks corrupted, return the safe default (0 / "" / null) ' - f'rather than substituting a "more logical" value.\n\n' - f'Receipt text:\n{receipt_text}\n\nJSON only:' - ) try: resp = await self._llm.submit( [{'role': 'user', 'content': prompt}], @@ -393,16 +439,15 @@ class ExpensesAgent(BaseAgent): first, last = raw.find('{'), raw.rfind('}') if first != -1 and last > first: data = json.loads(raw[first:last + 1]) - return { - 'vendor': str(data.get('vendor', filename)), - 'amount': float(data.get('amount', 0.0)), - 'date': str(data.get('date') or date_hint or today), - 'time': data.get('time') or None, - 'product_name': str(data.get('product_name', '')), - } + v = str(data.get('vendor', '') or '').strip() + if v: + vendor = v + product_name = str(data.get('product_name', '') or '').strip() except Exception as exc: - logger.warning('Receipt parse failed for %s: %s', filename, exc) - return fallback + logger.warning('Receipt vendor/category parse failed for %s: %s', filename, exc) + + return {'vendor': vendor, 'amount': amount, 'date': date, + 'time': None, 'product_name': product_name} async def _report(self) -> AgentReport: data = self._gathered_data diff --git a/agent_service/routers/approval.py b/agent_service/routers/approval.py index 5d7f587..39137ff 100644 --- a/agent_service/routers/approval.py +++ b/agent_service/routers/approval.py @@ -33,8 +33,8 @@ async def list_pending(): raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail='DB not ready') async with pool.acquire(timeout=10) as conn: rows = await conn.fetch( - 'SELECT directive_id, agent_name, action_type, description, created_at, context_data ' - 'FROM ab_directive_log WHERE status = $1 ORDER BY created_at ASC', + 'SELECT directive_id, agent_name, action_type, description, started_at, context_data ' + 'FROM ab_directive_log WHERE status = $1 ORDER BY started_at ASC', 'pending_approval', ) return [ @@ -43,7 +43,7 @@ async def list_pending(): agent=r['agent_name'] or '', action=r['action_type'] or '', description=r['description'] or '', - created_at=str(r['created_at']), + created_at=str(r['started_at'] or ''), context=r['context_data'] or {}, ) for r in rows diff --git a/tests/test_expenses_agent.py b/tests/test_expenses_agent.py index 0b1b332..99eabc0 100644 --- a/tests/test_expenses_agent.py +++ b/tests/test_expenses_agent.py @@ -423,15 +423,73 @@ async def test_act_no_employee_returns_empty_and_escalates(): # --------------------------------------------------------------------------- -# _parse_receipt_text — LLM extraction path +# _extract_amount_from_text / _extract_date_from_text — regex helpers +# --------------------------------------------------------------------------- + +from agent_service.agents.expenses_agent import ( + _extract_amount_from_text, _extract_date_from_text, +) + + +class TestExtractAmount: + def test_simple_total(self): + assert _extract_amount_from_text('Acme\nTotal: $9.99') == 9.99 + + def test_grand_total(self): + assert _extract_amount_from_text('Subtotal: $20.00\nGrand Total: $22.46') == 22.46 + + def test_amount_due(self): + assert _extract_amount_from_text('Amount Due: 198.40') == 198.40 + + def test_no_dollar_sign(self): + assert _extract_amount_from_text('TOTAL 15.75') == 15.75 + + def test_last_match_wins(self): + # Grand total should beat subtotal + text = 'Subtotal 18.00\nTax 1.50\nTotal 19.50' + assert _extract_amount_from_text(text) == 19.50 + + def test_empty_text(self): + assert _extract_amount_from_text('') == 0.0 + + def test_no_total_line(self): + assert _extract_amount_from_text('No price here') == 0.0 + + def test_comma_in_amount(self): + assert _extract_amount_from_text('Grand Total: $1,234.56') == 1234.56 + + +class TestExtractDate: + def test_iso_format(self): + assert _extract_date_from_text('Date: 2026-05-09') == '2026-05-09' + + def test_slash_iso(self): + assert _extract_date_from_text('2026/05/09') == '2026-05-09' + + def test_us_format(self): + assert _extract_date_from_text('05/09/2026') == '2026-05-09' + + def test_us_short_year(self): + assert _extract_date_from_text('05/09/26') == '2026-05-09' + + def test_no_date(self): + assert _extract_date_from_text('No date here') is None + + def test_empty(self): + assert _extract_date_from_text('') is None + + +# --------------------------------------------------------------------------- +# _parse_receipt_text — combined extraction # --------------------------------------------------------------------------- @pytest.mark.asyncio -async def test_parse_plain_ocr_text_uses_llm(): - """Plain OCR text should go through the LLM extraction path.""" +async def test_parse_plain_ocr_text_uses_llm_for_vendor(): + """Regex extracts amount; LLM called only for vendor + product_name.""" agent = _make_agent() llm_resp = MagicMock() - llm_resp.content = '{"vendor":"Acme","amount":9.99,"date":"2026-05-09","time":null,"product_name":"Meals"}' + # LLM now only returns vendor + product_name + llm_resp.content = '{"vendor":"Acme","product_name":"Meals"}' agent._llm.submit = AsyncMock(return_value=llm_resp) result = await agent._parse_receipt_text( @@ -439,10 +497,44 @@ async def test_parse_plain_ocr_text_uses_llm(): expense_products=[{'id': 1, 'name': 'Meals'}], ) assert result['vendor'] == 'Acme' - assert result['amount'] == 9.99 + assert result['amount'] == 9.99 # from regex, not LLM + assert result['product_name'] == 'Meals' agent._llm.submit.assert_called_once() +@pytest.mark.asyncio +async def test_parse_date_hint_overrides_ocr_date(): + """date_hint from filename must be used; LLM date should be ignored.""" + agent = _make_agent() + llm_resp = MagicMock() + llm_resp.content = '{"vendor":"Shell","product_name":"Fuel"}' + agent._llm.submit = AsyncMock(return_value=llm_resp) + + result = await agent._parse_receipt_text( + 'Shell Gas\n05/09/2021\nTotal: $45.00', 'shell.jpg', + date_hint='2026-05-09', + ) + assert result['date'] == '2026-05-09' # filename timestamp wins + assert result['amount'] == 45.00 + + +@pytest.mark.asyncio +async def test_parse_ocr_failed_skips_llm_amount(): + """When OCR fails, amount=0 and date comes from hint or today.""" + agent = _make_agent() + llm_resp = MagicMock() + llm_resp.content = '{"vendor":"","product_name":"Meals"}' + agent._llm.submit = AsyncMock(return_value=llm_resp) + + result = await agent._parse_receipt_text( + '[Image: broken.jpg — OCR failed]', 'broken.jpg', + date_hint='2026-05-10', + expense_products=[{'id': 1, 'name': 'Meals'}], + ) + assert result['amount'] == 0.0 + assert result['date'] == '2026-05-10' + + # --------------------------------------------------------------------------- # parse_upload — receipt_parser.py # ---------------------------------------------------------------------------