feat: OCR via tesseract, dedup, category selection for expense receipts
- Dockerfile: install tesseract-ocr so Pillow+pytesseract can OCR receipt images
- operational_store: JSON-serialize raw_data before passing to asyncpg JSONB
- receipt_parser: add SHA256 hash + date extracted from filename timestamps
- expenses_agent: deduplicate receipts by hash before creating expense records
- expenses_agent: fetch all expensable Odoo products, pass list to LLM for category selection (Meals, Flights, etc.) per receipt
- expenses_agent: pass date_hint from filename (e.g. 20260509_180857.jpg -> 2026-05-09) as fallback when OCR text is unavailable
- expenses_tools: add get_expense_products() to fetch all expensable products

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,12 +1,17 @@
|
||||
from __future__ import annotations
|
||||
import base64
|
||||
import hashlib
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Extract YYYYMMDD from filenames like 20260509_180857.jpg
|
||||
# Groups 1-3 capture the year, month and day digits; the date must be followed
# by an underscore and six more digits. Used with .search(), so the match may
# start anywhere in the filename.
# NOTE(review): \d{2} accepts any two digits, so month "13" or day "45" still
# match and are formatted into the date string as-is — confirm that consumers
# of the extracted date validate it.
_DATE_PATTERN = re.compile(r'(\d{4})(\d{2})(\d{2})_\d{6}')
|
||||
|
||||
_MIME = {
|
||||
'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
|
||||
'.png': 'image/png', '.gif': 'image/gif',
|
||||
@@ -31,6 +36,13 @@ def parse_upload(filename: str, data: bytes) -> list[dict]:
|
||||
|
||||
b64 = base64.b64encode(data).decode()
|
||||
mimetype = _MIME.get(ext, 'application/octet-stream')
|
||||
sha256 = hashlib.sha256(data).hexdigest()
|
||||
|
||||
# Extract date from timestamp-style filenames (e.g. 20260509_180857.jpg)
|
||||
date_from_name = None
|
||||
m = _DATE_PATTERN.search(filename)
|
||||
if m:
|
||||
date_from_name = f'{m.group(1)}-{m.group(2)}-{m.group(3)}'
|
||||
|
||||
if ext in _IMAGE_EXTS:
|
||||
text = _ocr_image(data, filename)
|
||||
@@ -46,7 +58,8 @@ def parse_upload(filename: str, data: bytes) -> list[dict]:
|
||||
except Exception:
|
||||
text = f'[Binary file: {filename}]'
|
||||
|
||||
return [{'filename': filename, 'text': text, 'b64': b64, 'mimetype': mimetype}]
|
||||
return [{'filename': filename, 'text': text, 'b64': b64, 'mimetype': mimetype,
|
||||
'sha256': sha256, 'date_from_name': date_from_name}]
|
||||
|
||||
|
||||
def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:
|
||||
|
||||
Reference in New Issue
Block a user