feat: file upload + expense report creation from Discuss attachments
- Discuss bot now reads ir.attachment from incoming messages; file-only messages no longer silently dropped - ZIP files are described (contents listed) and bot asks clarifying question before acting; user's follow-up reply looks back for pending attachments so files don't need to be re-uploaded - receipt_parser: extracts text from ZIP (recursive), JPG/PNG/etc (OCR), PDF (pdfplumber), HTML, TXT - expenses_agent: full rewrite fixing broken method signatures; adds create_expense_sheet / create_expense / attach_receipt flow driven by LLM receipt parsing (Ollama, HIPAA-locked) - master_agent: extra_context threads receipts + user_id into directives - FastAPI /upload multipart endpoint; registered in main.py - Odoo /ai/upload controller proxies files to agent service - ab_ai_bot: dispatch_message_with_files() for multipart uploads Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,7 +3,7 @@ import logging
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from odoo import http
|
from odoo import http
|
||||||
from odoo.http import request
|
from odoo.http import request, Response
|
||||||
|
|
||||||
_logger = logging.getLogger(__name__)
|
_logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -58,3 +58,41 @@ class AiApprovalController(http.Controller):
|
|||||||
session_id=session_id,
|
session_id=session_id,
|
||||||
)
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
@http.route('/ai/upload', type='http', auth='user', methods=['POST'], csrf=False)
|
||||||
|
def upload(self, **kwargs):
|
||||||
|
bot = request.env['ab.ai.bot'].sudo().get_active_bot()
|
||||||
|
if not bot:
|
||||||
|
return Response(
|
||||||
|
json.dumps({'error': 'No bot configured'}),
|
||||||
|
content_type='application/json', status=503)
|
||||||
|
|
||||||
|
url = bot._get_service_url() + '/upload'
|
||||||
|
message = request.httprequest.form.get(
|
||||||
|
'message', 'Create an employee expense report from these receipts.')
|
||||||
|
session_id = request.httprequest.form.get('session_id', '')
|
||||||
|
|
||||||
|
files_data = [
|
||||||
|
('files', (f.filename, f.read(), f.content_type or 'application/octet-stream'))
|
||||||
|
for f in request.httprequest.files.getlist('files')
|
||||||
|
]
|
||||||
|
|
||||||
|
try:
|
||||||
|
resp = requests.post(
|
||||||
|
url,
|
||||||
|
data={
|
||||||
|
'user_id': str(request.env.user.id),
|
||||||
|
'message': message,
|
||||||
|
'session_id': session_id,
|
||||||
|
},
|
||||||
|
files=files_data or [('files', ('empty', b'', 'application/octet-stream'))],
|
||||||
|
headers=bot._build_headers(),
|
||||||
|
timeout=120,
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
return Response(resp.text, content_type='application/json')
|
||||||
|
except Exception as exc:
|
||||||
|
_logger.error('upload proxy failed: %s', exc)
|
||||||
|
return Response(
|
||||||
|
json.dumps({'error': str(exc), 'reply': 'Upload failed. Please try again.'}),
|
||||||
|
content_type='application/json', status=500)
|
||||||
|
|||||||
@@ -111,6 +111,44 @@ class AbAiBot(models.Model):
|
|||||||
_logger.error('dispatch_message failed: %s', exc)
|
_logger.error('dispatch_message failed: %s', exc)
|
||||||
raise UserError(_('Could not reach AI service: %s') % exc)
|
raise UserError(_('Could not reach AI service: %s') % exc)
|
||||||
|
|
||||||
|
def dispatch_message_with_files(self, user_id, message, attachments, context=None, session_id=None):
|
||||||
|
"""Send a message with file attachments to the /upload endpoint as multipart."""
|
||||||
|
self.ensure_one()
|
||||||
|
import base64
|
||||||
|
url = self._get_service_url() + '/upload'
|
||||||
|
|
||||||
|
files = []
|
||||||
|
for att in attachments:
|
||||||
|
try:
|
||||||
|
data = base64.b64decode(att.datas) if att.datas else b''
|
||||||
|
files.append(('files', (att.name or 'attachment',
|
||||||
|
data,
|
||||||
|
att.mimetype or 'application/octet-stream')))
|
||||||
|
except Exception as exc:
|
||||||
|
_logger.warning('Could not encode attachment %s: %s', att.id, exc)
|
||||||
|
|
||||||
|
# Omit Content-Type so requests sets the multipart boundary automatically
|
||||||
|
headers = {}
|
||||||
|
if self.webhook_secret:
|
||||||
|
headers['X-ActiveBlue-Signature'] = self.webhook_secret
|
||||||
|
|
||||||
|
form_data = {
|
||||||
|
'user_id': str(user_id),
|
||||||
|
'message': message or 'Create an employee expense report from these receipts.',
|
||||||
|
'session_id': session_id or '',
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
resp = requests.post(url, data=form_data, files=files or [('files', ('empty', b'', 'text/plain'))],
|
||||||
|
headers=headers, timeout=120)
|
||||||
|
resp.raise_for_status()
|
||||||
|
return resp.json()
|
||||||
|
except requests.exceptions.Timeout:
|
||||||
|
raise UserError(_('AI service timed out. Please try again.'))
|
||||||
|
except requests.exceptions.RequestException as exc:
|
||||||
|
_logger.error('dispatch_message_with_files failed: %s', exc)
|
||||||
|
raise UserError(_('Could not reach AI service: %s') % exc)
|
||||||
|
|
||||||
@api.model
|
@api.model
|
||||||
def cron_ping_all(self):
|
def cron_ping_all(self):
|
||||||
any_online = False
|
any_online = False
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import zipfile
|
||||||
|
|
||||||
from odoo import models, api
|
from odoo import models, api
|
||||||
|
|
||||||
@@ -8,11 +11,52 @@ _logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
_HTML_TAG = re.compile(r'<[^>]+>')
|
_HTML_TAG = re.compile(r'<[^>]+>')
|
||||||
|
|
||||||
|
# How many recent messages to scan when looking for a pending file upload
|
||||||
|
_LOOKBACK_MESSAGES = 10
|
||||||
|
|
||||||
|
# File type labels shown in the clarification message
|
||||||
|
_EXT_LABELS = {
|
||||||
|
'jpg': 'image', 'jpeg': 'image', 'png': 'image', 'gif': 'image',
|
||||||
|
'bmp': 'image', 'tiff': 'image', 'tif': 'image', 'webp': 'image',
|
||||||
|
'pdf': 'PDF', 'html': 'HTML', 'htm': 'HTML',
|
||||||
|
'txt': 'text', 'csv': 'spreadsheet', 'xlsx': 'spreadsheet',
|
||||||
|
'zip': 'ZIP archive',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _strip_html(html: str) -> str:
|
def _strip_html(html: str) -> str:
|
||||||
return _HTML_TAG.sub(' ', html or '').strip()
|
return _HTML_TAG.sub(' ', html or '').strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _ext(filename: str) -> str:
|
||||||
|
return filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
|
||||||
|
|
||||||
|
|
||||||
|
def _describe_zip(datas_b64: str, zip_name: str) -> str:
|
||||||
|
"""Return a short HTML summary of a ZIP's contents without extracting data."""
|
||||||
|
try:
|
||||||
|
raw = base64.b64decode(datas_b64)
|
||||||
|
with zipfile.ZipFile(io.BytesIO(raw)) as zf:
|
||||||
|
members = [m for m in zf.namelist() if not m.endswith('/')]
|
||||||
|
if not members:
|
||||||
|
return f'<b>{zip_name}</b> (empty archive)'
|
||||||
|
# Count by type
|
||||||
|
counts: dict[str, int] = {}
|
||||||
|
for m in members:
|
||||||
|
label = _EXT_LABELS.get(_ext(m), 'file')
|
||||||
|
counts[label] = counts.get(label, 0) + 1
|
||||||
|
type_summary = ', '.join(f'{n} {t}(s)' for t, n in counts.items())
|
||||||
|
lines = [f'<b>{zip_name}</b> — {len(members)} item(s): {type_summary}']
|
||||||
|
for m in members[:8]:
|
||||||
|
lines.append(f' • {m}')
|
||||||
|
if len(members) > 8:
|
||||||
|
lines.append(f' … and {len(members) - 8} more')
|
||||||
|
return '<br/>'.join(lines)
|
||||||
|
except Exception as exc:
|
||||||
|
_logger.warning('_describe_zip failed for %s: %s', zip_name, exc)
|
||||||
|
return f'<b>{zip_name}</b> (could not inspect contents)'
|
||||||
|
|
||||||
|
|
||||||
class DiscussChannel(models.Model):
|
class DiscussChannel(models.Model):
|
||||||
_inherit = 'discuss.channel'
|
_inherit = 'discuss.channel'
|
||||||
|
|
||||||
@@ -40,10 +84,28 @@ class DiscussChannel(models.Model):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
text = _strip_html(body)
|
text = _strip_html(body)
|
||||||
if not text:
|
attachments = result.attachment_ids
|
||||||
|
|
||||||
|
# Nothing to work with
|
||||||
|
if not text and not attachments:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# Identify the human sender
|
# ── Case 1: file(s) with no instruction ──────────────────────────────
|
||||||
|
# Show the user what we received and ask what to do with it.
|
||||||
|
if attachments and not text:
|
||||||
|
self._post_file_clarification(attachments, bot_partner)
|
||||||
|
return result
|
||||||
|
|
||||||
|
# ── Case 2: text only — look back for a pending file upload ──────────
|
||||||
|
# If the user just replied to our clarification question, find the
|
||||||
|
# attachment(s) they uploaded earlier in this conversation.
|
||||||
|
pending = self.env['ir.attachment'].browse()
|
||||||
|
if text and not attachments:
|
||||||
|
pending = self._find_pending_attachments(bot_partner)
|
||||||
|
|
||||||
|
effective_attachments = attachments or pending
|
||||||
|
|
||||||
|
# ── Case 3: text (+ possibly pending files) → dispatch ───────────────
|
||||||
human_partner = member_partners.filtered(lambda p: p != bot_partner)[:1]
|
human_partner = member_partners.filtered(lambda p: p != bot_partner)[:1]
|
||||||
user = self.env['res.users'].search([('partner_id', '=', human_partner.id)], limit=1)
|
user = self.env['res.users'].search([('partner_id', '=', human_partner.id)], limit=1)
|
||||||
uid = user.id if user else self.env.uid
|
uid = user.id if user else self.env.uid
|
||||||
@@ -52,11 +114,21 @@ class DiscussChannel(models.Model):
|
|||||||
bot = self.env['ab.ai.bot'].sudo().search([('active', '=', True)], limit=1)
|
bot = self.env['ab.ai.bot'].sudo().search([('active', '=', True)], limit=1)
|
||||||
if not bot:
|
if not bot:
|
||||||
return result
|
return result
|
||||||
response = bot.dispatch_message(
|
|
||||||
user_id=uid,
|
if effective_attachments:
|
||||||
message=text,
|
response = bot.dispatch_message_with_files(
|
||||||
context={'channel_id': self.id, 'source': 'discuss'},
|
user_id=uid,
|
||||||
)
|
message=text,
|
||||||
|
attachments=effective_attachments,
|
||||||
|
context={'channel_id': self.id, 'source': 'discuss'},
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
response = bot.dispatch_message(
|
||||||
|
user_id=uid,
|
||||||
|
message=text,
|
||||||
|
context={'channel_id': self.id, 'source': 'discuss'},
|
||||||
|
)
|
||||||
|
|
||||||
reply = (response or {}).get('reply') or (response or {}).get('message') or \
|
reply = (response or {}).get('reply') or (response or {}).get('message') or \
|
||||||
'I could not process your request right now.'
|
'I could not process your request right now.'
|
||||||
self.sudo().message_post(
|
self.sudo().message_post(
|
||||||
@@ -69,3 +141,52 @@ class DiscussChannel(models.Model):
|
|||||||
_logger.error('AI bot Discuss reply failed: %s', exc)
|
_logger.error('AI bot Discuss reply failed: %s', exc)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def _post_file_clarification(self, attachments, bot_partner):
|
||||||
|
"""Describe the uploaded file(s) and ask the user what to do with them."""
|
||||||
|
lines = []
|
||||||
|
for att in attachments:
|
||||||
|
name = att.name or 'file'
|
||||||
|
ext = _ext(name)
|
||||||
|
if ext == 'zip' and att.datas:
|
||||||
|
lines.append(_describe_zip(att.datas, name))
|
||||||
|
else:
|
||||||
|
label = _EXT_LABELS.get(ext, 'file')
|
||||||
|
lines.append(f'<b>{name}</b> ({label})')
|
||||||
|
|
||||||
|
file_summary = '<br/>'.join(lines)
|
||||||
|
question = (
|
||||||
|
f'I received the following file(s):<br/>{file_summary}<br/><br/>'
|
||||||
|
'What would you like me to do with them? Some options:<br/>'
|
||||||
|
'• <b>Create an expense report</b> from these receipts<br/>'
|
||||||
|
'• <b>Import products</b> from this data<br/>'
|
||||||
|
'• <b>Something else</b> — just tell me what you need'
|
||||||
|
)
|
||||||
|
self.sudo().message_post(
|
||||||
|
body=question,
|
||||||
|
author_id=bot_partner.id,
|
||||||
|
message_type='comment',
|
||||||
|
subtype_xmlid='mail.mt_comment',
|
||||||
|
)
|
||||||
|
|
||||||
|
def _find_pending_attachments(self, bot_partner):
|
||||||
|
"""
|
||||||
|
Scan the last _LOOKBACK_MESSAGES messages in this channel for the most
|
||||||
|
recent human-sent message that has attachments. Only returns them if
|
||||||
|
the immediately following bot message looks like a clarification question
|
||||||
|
(i.e. the bot hasn't already acted on those files).
|
||||||
|
"""
|
||||||
|
messages = self.message_ids.sorted('date', reverse=True)[:_LOOKBACK_MESSAGES]
|
||||||
|
prev_was_bot_question = False
|
||||||
|
for msg in messages:
|
||||||
|
is_bot = msg.author_id == bot_partner
|
||||||
|
if is_bot:
|
||||||
|
# Check whether this bot message was a clarification question
|
||||||
|
if 'what would you like me to do' in (msg.body or '').lower():
|
||||||
|
prev_was_bot_question = True
|
||||||
|
continue
|
||||||
|
# Human message
|
||||||
|
if msg.attachment_ids and prev_was_bot_question:
|
||||||
|
return msg.attachment_ids
|
||||||
|
prev_was_bot_question = False
|
||||||
|
return self.env['ir.attachment'].browse()
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ class DirectiveContext:
|
|||||||
recent_findings: list = field(default_factory=list)
|
recent_findings: list = field(default_factory=list)
|
||||||
conversation_summary: str = ''
|
conversation_summary: str = ''
|
||||||
peer_data: dict = field(default_factory=dict)
|
peer_data: dict = field(default_factory=dict)
|
||||||
|
receipts: list = field(default_factory=list) # populated by upload flow
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
from datetime import date as _date
|
||||||
from .base_agent import BaseAgent, AgentReport, AgentDirective, SweepReport
|
from .base_agent import BaseAgent, AgentReport, AgentDirective, SweepReport
|
||||||
from ..tools.expenses_tools import ExpensesTools
|
from ..tools.expenses_tools import ExpensesTools
|
||||||
|
|
||||||
@@ -46,23 +48,27 @@ class ExpensesAgent(BaseAgent):
|
|||||||
def __init__(self, odoo, llm, peer_bus=None):
|
def __init__(self, odoo, llm, peer_bus=None):
|
||||||
super().__init__(odoo, llm, peer_bus)
|
super().__init__(odoo, llm, peer_bus)
|
||||||
self._et = ExpensesTools(odoo)
|
self._et = ExpensesTools(odoo)
|
||||||
self._gathered_data = {}
|
self._gathered_data: dict = {}
|
||||||
self._actions_taken = []
|
self._actions_taken: list = []
|
||||||
self._escalations_list = []
|
self._escalations_list: list = []
|
||||||
|
|
||||||
async def _plan(self, directive: AgentDirective) -> dict:
|
async def _plan(self) -> dict:
|
||||||
intent = (directive.intent or '').lower()
|
task = (self._directive.task if self._directive else '').lower()
|
||||||
|
receipts = getattr(self._directive.context, 'receipts', []) if self._directive else []
|
||||||
return {
|
return {
|
||||||
'fetch_summary': any(k in intent for k in ('summary', 'overview', 'report')),
|
'mode': 'create_from_receipts' if receipts else 'read',
|
||||||
'fetch_pending': any(k in intent for k in ('pending', 'approve', 'approval')),
|
'fetch_summary': any(k in task for k in ('summary', 'overview')) and not receipts,
|
||||||
'employee_id': directive.context.get('employee_id'),
|
'fetch_pending': any(k in task for k in ('pending', 'approve', 'approval')) and not receipts,
|
||||||
'date_from': directive.context.get('date_from'),
|
'employee_id': self._directive.params.get('employee_id') if self._directive else None,
|
||||||
'date_to': directive.context.get('date_to'),
|
'date_from': self._directive.params.get('date_from') if self._directive else None,
|
||||||
|
'date_to': self._directive.params.get('date_to') if self._directive else None,
|
||||||
}
|
}
|
||||||
|
|
||||||
async def _gather(self, ctx: dict) -> dict:
|
async def _gather(self, plan: dict) -> dict:
|
||||||
plan = ctx.get('plan', {})
|
data: dict = {'mode': plan.get('mode', 'read')}
|
||||||
data: dict = {}
|
if plan.get('mode') == 'create_from_receipts':
|
||||||
|
self._gathered_data = data
|
||||||
|
return data
|
||||||
data['summary'] = await self._et.get_expenses_summary(
|
data['summary'] = await self._et.get_expenses_summary(
|
||||||
date_from=plan.get('date_from'), date_to=plan.get('date_to'),
|
date_from=plan.get('date_from'), date_to=plan.get('date_to'),
|
||||||
)
|
)
|
||||||
@@ -71,9 +77,12 @@ class ExpensesAgent(BaseAgent):
|
|||||||
self._gathered_data = data
|
self._gathered_data = data
|
||||||
return data
|
return data
|
||||||
|
|
||||||
async def _reason(self, ctx: dict) -> dict:
|
async def _reason(self) -> dict:
|
||||||
data = self._gathered_data
|
data = self._gathered_data
|
||||||
analysis: dict = {'escalations': [], 'flags': []}
|
analysis: dict = {'escalations': [], 'flags': []}
|
||||||
|
if data.get('mode') == 'create_from_receipts':
|
||||||
|
self._escalations_list = []
|
||||||
|
return analysis
|
||||||
summary = data.get('summary', {})
|
summary = data.get('summary', {})
|
||||||
if summary.get('pending_approval_count', 0) > 10:
|
if summary.get('pending_approval_count', 0) > 10:
|
||||||
analysis['escalations'].append(
|
analysis['escalations'].append(
|
||||||
@@ -82,23 +91,135 @@ class ExpensesAgent(BaseAgent):
|
|||||||
self._escalations_list = analysis['escalations']
|
self._escalations_list = analysis['escalations']
|
||||||
return analysis
|
return analysis
|
||||||
|
|
||||||
async def _act(self, ctx: dict) -> list:
|
async def _act(self, reasoning: dict) -> list:
|
||||||
return []
|
if self._gathered_data.get('mode') != 'create_from_receipts':
|
||||||
|
return []
|
||||||
|
receipts = getattr(self._directive.context, 'receipts', []) if self._directive else []
|
||||||
|
if not receipts:
|
||||||
|
return []
|
||||||
|
|
||||||
async def _report(self, ctx: dict) -> AgentReport:
|
user_id = (self._directive.context.peer_data.get('requesting_user_id')
|
||||||
|
if self._directive else None)
|
||||||
|
employee_id = await self._et.get_employee_id_for_user(user_id)
|
||||||
|
if not employee_id:
|
||||||
|
self._escalations_list.append(
|
||||||
|
'No employee record found for the current user; cannot create expense report.')
|
||||||
|
return []
|
||||||
|
|
||||||
|
sheet_name = f'Expense Report - {_date.today().isoformat()}'
|
||||||
|
sheet_result = await self._et.create_expense_sheet(sheet_name, employee_id)
|
||||||
|
if not sheet_result.success:
|
||||||
|
self._escalations_list.append(f'Failed to create expense sheet: {sheet_result.error}')
|
||||||
|
return []
|
||||||
|
|
||||||
|
sheet_id = sheet_result.record_id
|
||||||
|
actions = [f'Created expense sheet "{sheet_name}" (ID {sheet_id})']
|
||||||
|
|
||||||
|
product_id = await self._et.get_default_expense_product()
|
||||||
|
|
||||||
|
for receipt in receipts:
|
||||||
|
parsed = await self._parse_receipt_text(
|
||||||
|
receipt.get('text', ''), receipt.get('filename', 'receipt'))
|
||||||
|
expense_result = await self._et.create_expense(
|
||||||
|
sheet_id=sheet_id,
|
||||||
|
employee_id=employee_id,
|
||||||
|
name=str(parsed.get('vendor', receipt.get('filename', 'Expense')))[:64],
|
||||||
|
total_amount=float(parsed.get('amount', 0.0)),
|
||||||
|
date=str(parsed.get('date') or _date.today().isoformat()),
|
||||||
|
product_id=product_id,
|
||||||
|
description=str(parsed.get('description', '')),
|
||||||
|
)
|
||||||
|
if expense_result.success:
|
||||||
|
actions.append(
|
||||||
|
f"Added: {parsed.get('vendor', 'Unknown vendor')} "
|
||||||
|
f"${float(parsed.get('amount', 0)):.2f} "
|
||||||
|
f"on {parsed.get('date', 'today')}"
|
||||||
|
)
|
||||||
|
if receipt.get('b64'):
|
||||||
|
await self._et.attach_receipt(
|
||||||
|
'hr.expense', expense_result.record_id,
|
||||||
|
receipt.get('filename', 'receipt'),
|
||||||
|
receipt['b64'],
|
||||||
|
receipt.get('mimetype', 'application/octet-stream'),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
actions.append(
|
||||||
|
f"Could not create expense for {receipt.get('filename', 'receipt')}: "
|
||||||
|
f"{expense_result.error}"
|
||||||
|
)
|
||||||
|
|
||||||
|
self._actions_taken = actions
|
||||||
|
return actions
|
||||||
|
|
||||||
|
async def _parse_receipt_text(self, text: str, filename: str) -> dict:
|
||||||
|
fallback = {'vendor': filename, 'amount': 0.0,
|
||||||
|
'date': _date.today().isoformat(), 'description': filename}
|
||||||
|
if not text or text.startswith('['):
|
||||||
|
return fallback
|
||||||
|
|
||||||
|
prompt = (
|
||||||
|
'Extract expense details from the following receipt text. '
|
||||||
|
'Return ONLY valid JSON with these keys: '
|
||||||
|
'"vendor" (string), "amount" (number, the total charged), '
|
||||||
|
'"date" (string YYYY-MM-DD, use today if absent), '
|
||||||
|
'"description" (string, brief expense type).\n\n'
|
||||||
|
f'Receipt text (first 2000 chars):\n{text[:2000]}\n\nJSON only:'
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
resp = await self._llm.submit(
|
||||||
|
[{'role': 'user', 'content': prompt}],
|
||||||
|
caller='expenses_agent_receipt_parser',
|
||||||
|
)
|
||||||
|
raw = (resp.content or '').strip()
|
||||||
|
first, last = raw.find('{'), raw.rfind('}')
|
||||||
|
if first != -1 and last > first:
|
||||||
|
data = json.loads(raw[first:last + 1])
|
||||||
|
return {
|
||||||
|
'vendor': str(data.get('vendor', filename)),
|
||||||
|
'amount': float(data.get('amount', 0.0)),
|
||||||
|
'date': str(data.get('date', _date.today().isoformat())),
|
||||||
|
'description': str(data.get('description', '')),
|
||||||
|
}
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning('Receipt parse failed for %s: %s', filename, exc)
|
||||||
|
return fallback
|
||||||
|
|
||||||
|
async def _report(self) -> AgentReport:
|
||||||
data = self._gathered_data
|
data = self._gathered_data
|
||||||
summary = data.get('summary', {})
|
directive_id = self._directive.directive_id if self._directive else ''
|
||||||
|
|
||||||
|
if data.get('mode') == 'create_from_receipts':
|
||||||
|
if self._actions_taken:
|
||||||
|
lines = '\n'.join(f' • {a}' for a in self._actions_taken)
|
||||||
|
summary = (
|
||||||
|
f'Expense report created successfully:\n{lines}\n\n'
|
||||||
|
'The report is in draft. Please open Odoo > Expenses, '
|
||||||
|
'review the entries, and click Submit to send for approval.'
|
||||||
|
)
|
||||||
|
status = 'complete'
|
||||||
|
else:
|
||||||
|
summary = ('Could not create expense report. ' +
|
||||||
|
'; '.join(self._escalations_list or ['Unknown error']))
|
||||||
|
status = 'failed'
|
||||||
|
return AgentReport(
|
||||||
|
directive_id=directive_id, agent=self.name, status=status,
|
||||||
|
summary=summary, data=data,
|
||||||
|
escalations=self._escalations_list, actions_taken=self._actions_taken)
|
||||||
|
|
||||||
|
summary_data = data.get('summary', {})
|
||||||
parts = []
|
parts = []
|
||||||
if summary:
|
if summary_data:
|
||||||
parts.append(
|
parts.append(
|
||||||
f'Expenses: {summary.get("total_expenses", 0)} records, '
|
f'Expenses: {summary_data.get("total_expenses", 0)} records, '
|
||||||
f'total {summary.get("total_amount", 0):.2f}. '
|
f'total ${summary_data.get("total_amount", 0):.2f}. '
|
||||||
f'{summary.get("pending_approval_count", 0)} pending approval.'
|
f'{summary_data.get("pending_approval_count", 0)} pending approval.'
|
||||||
)
|
)
|
||||||
if not parts:
|
if not parts:
|
||||||
parts.append('Expenses review complete.')
|
parts.append('Expenses review complete.')
|
||||||
return AgentReport(agent=self.name, summary=chr(10).join(parts),
|
return AgentReport(
|
||||||
data=data, escalations=self._escalations_list, actions_taken=[])
|
directive_id=directive_id, agent=self.name, status='complete',
|
||||||
|
summary='\n'.join(parts), data=data,
|
||||||
|
escalations=self._escalations_list, actions_taken=[])
|
||||||
|
|
||||||
async def _dispatch_tool(self, name: str, args: dict):
|
async def _dispatch_tool(self, name: str, args: dict):
|
||||||
dispatch = {
|
dispatch = {
|
||||||
@@ -121,7 +242,8 @@ class ExpensesAgent(BaseAgent):
|
|||||||
if req_type == 'expenses_summary':
|
if req_type == 'expenses_summary':
|
||||||
return await self._et.get_expenses_summary()
|
return await self._et.get_expenses_summary()
|
||||||
if req_type == 'employee_expenses':
|
if req_type == 'employee_expenses':
|
||||||
return {'expenses': await self._et.get_expense_by_employee(employee_id=request['employee_id'])}
|
return {'expenses': await self._et.get_expense_by_employee(
|
||||||
|
employee_id=request['employee_id'])}
|
||||||
return {'error': f'Unknown type: {req_type}'}
|
return {'error': f'Unknown type: {req_type}'}
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
return {'error': str(exc)}
|
return {'error': str(exc)}
|
||||||
@@ -131,10 +253,15 @@ class ExpensesAgent(BaseAgent):
|
|||||||
try:
|
try:
|
||||||
pending = await self._et.get_pending_approvals()
|
pending = await self._et.get_pending_approvals()
|
||||||
for sheet in pending:
|
for sheet in pending:
|
||||||
findings.append({'type': 'pending_expense_approval', 'sheet_id': sheet.get('id'),
|
emp = sheet.get('employee_id', [0, ''])
|
||||||
'employee': sheet.get('employee_id', [0, ''])[1] if isinstance(sheet.get('employee_id'), list) else '',
|
findings.append({
|
||||||
'amount': sheet.get('total_amount', 0), 'severity': 'low'})
|
'type': 'pending_expense_approval',
|
||||||
|
'sheet_id': sheet.get('id'),
|
||||||
|
'employee': emp[1] if isinstance(emp, list) else '',
|
||||||
|
'amount': sheet.get('total_amount', 0),
|
||||||
|
'severity': 'low',
|
||||||
|
})
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
return SweepReport(agent=self.name, findings=[], actions=[], error=str(exc))
|
return SweepReport(agent=self.name, findings=[], error=str(exc))
|
||||||
return SweepReport(agent=self.name, findings=findings, actions=[],
|
return SweepReport(agent=self.name, findings=findings, actions_taken=[],
|
||||||
summary=f'Expenses sweep: {len(findings)} pending approvals.')
|
summary=f'Expenses sweep: {len(findings)} pending approvals.')
|
||||||
|
|||||||
@@ -71,7 +71,8 @@ class MasterAgent:
|
|||||||
# example block, so str.format would treat them as fields.
|
# example block, so str.format would treat them as fields.
|
||||||
return template.replace('{agent_list}', agent_list)
|
return template.replace('{agent_list}', agent_list)
|
||||||
|
|
||||||
async def handle_message(self, user_id, channel_id, message, directive_id) -> MasterResponse:
|
async def handle_message(self, user_id, channel_id, message, directive_id,
|
||||||
|
extra_context: dict = None) -> MasterResponse:
|
||||||
try:
|
try:
|
||||||
user_id = int(user_id)
|
user_id = int(user_id)
|
||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
@@ -107,7 +108,8 @@ class MasterAgent:
|
|||||||
await self._memory.append_message(user_id, 'assistant', msg, directive_id)
|
await self._memory.append_message(user_id, 'assistant', msg, directive_id)
|
||||||
await self._log_directive_complete(directive_id, 'failed', msg)
|
await self._log_directive_complete(directive_id, 'failed', msg)
|
||||||
return MasterResponse(directive_id=directive_id, response=msg, status='failed')
|
return MasterResponse(directive_id=directive_id, response=msg, status='failed')
|
||||||
directives = await self._build_directives(intent, context, directive_id)
|
directives = await self._build_directives(intent, context, directive_id,
|
||||||
|
user_id=user_id, extra_context=extra_context)
|
||||||
reports = await self._dispatch_agents(directives)
|
reports = await self._dispatch_agents(directives)
|
||||||
response_text = await self._synthesize(reports, context)
|
response_text = await self._synthesize(reports, context)
|
||||||
await self._update_memory(user_id, message, response_text, reports, directive_id)
|
await self._update_memory(user_id, message, response_text, reports, directive_id)
|
||||||
@@ -197,20 +199,26 @@ class MasterAgent:
|
|||||||
return AccessResult(allowed=False, denied_agents=denied)
|
return AccessResult(allowed=False, denied_agents=denied)
|
||||||
return AccessResult(allowed=True)
|
return AccessResult(allowed=True)
|
||||||
|
|
||||||
async def _build_directives(self, intent: IntentResult, context: MasterContext, directive_id) -> list:
|
async def _build_directives(self, intent: IntentResult, context: MasterContext, directive_id,
|
||||||
|
user_id=None, extra_context: dict = None) -> list:
|
||||||
|
receipts = (extra_context or {}).get('receipts', [])
|
||||||
directives = []
|
directives = []
|
||||||
for agent_key in intent.agents:
|
for agent_key in intent.agents:
|
||||||
|
authorized = ['read', 'search', 'report', 'post_chatter',
|
||||||
|
'send_email', 'create_non_financial', 'write_non_financial']
|
||||||
|
if receipts:
|
||||||
|
authorized.append('create_expense')
|
||||||
ctx = DirectiveContext(
|
ctx = DirectiveContext(
|
||||||
client_profile=context.knowledge,
|
client_profile=context.knowledge,
|
||||||
recent_findings=context.operational_findings,
|
recent_findings=context.operational_findings,
|
||||||
conversation_summary=chr(10).join(
|
conversation_summary=chr(10).join(
|
||||||
m['content'] for m in context.conversation[-5:] if m['role'] == 'assistant'),
|
m['content'] for m in context.conversation[-5:] if m['role'] == 'assistant'),
|
||||||
peer_data={})
|
peer_data={'requesting_user_id': user_id},
|
||||||
|
receipts=receipts)
|
||||||
d = AgentDirective(
|
d = AgentDirective(
|
||||||
directive_id=directive_id, agent=agent_key, task=intent.intent_summary,
|
directive_id=directive_id, agent=agent_key, task=intent.intent_summary,
|
||||||
params=intent.params, context=ctx,
|
params=intent.params, context=ctx,
|
||||||
authorized_actions=['read', 'search', 'report', 'post_chatter',
|
authorized_actions=authorized,
|
||||||
'send_email', 'create_non_financial', 'write_non_financial'],
|
|
||||||
constraints={'max_amount': 5000})
|
constraints={'max_amount': 5000})
|
||||||
directives.append(d)
|
directives.append(d)
|
||||||
return directives
|
return directives
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ from fastapi.middleware.cors import CORSMiddleware
|
|||||||
|
|
||||||
from .config import get_settings
|
from .config import get_settings
|
||||||
from . import app_state
|
from . import app_state
|
||||||
from .routers import dispatch, approval, registry, sweep, health
|
from .routers import dispatch, approval, registry, sweep, health, upload
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -224,6 +224,7 @@ def create_app() -> FastAPI:
|
|||||||
allow_headers=['*'],
|
allow_headers=['*'],
|
||||||
)
|
)
|
||||||
app.include_router(dispatch.router)
|
app.include_router(dispatch.router)
|
||||||
|
app.include_router(upload.router)
|
||||||
app.include_router(approval.router)
|
app.include_router(approval.router)
|
||||||
app.include_router(registry.router)
|
app.include_router(registry.router)
|
||||||
app.include_router(sweep.router)
|
app.include_router(sweep.router)
|
||||||
|
|||||||
80
agent_service/routers/upload.py
Normal file
80
agent_service/routers/upload.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import uuid
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile, status
|
||||||
|
|
||||||
|
from ..config import get_settings
|
||||||
|
from .dispatch import DispatchResponse, _check_rate_limit, _verify_webhook_secret
|
||||||
|
from ..tools.receipt_parser import parse_upload
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
router = APIRouter(prefix='/upload', tags=['upload'])
|
||||||
|
|
||||||
|
|
||||||
|
@router.post('', response_model=DispatchResponse)
|
||||||
|
async def upload(
|
||||||
|
request: Request,
|
||||||
|
user_id: str = Form(...),
|
||||||
|
message: str = Form(default='Create an employee expense report from these receipts.'),
|
||||||
|
session_id: Optional[str] = Form(default=None),
|
||||||
|
files: List[UploadFile] = File(default=[]),
|
||||||
|
):
|
||||||
|
_verify_webhook_secret(request)
|
||||||
|
_check_rate_limit(user_id)
|
||||||
|
|
||||||
|
from ..app_state import get_master_agent
|
||||||
|
master = get_master_agent()
|
||||||
|
if master is None:
|
||||||
|
raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
|
||||||
|
detail='Agent service not ready')
|
||||||
|
|
||||||
|
receipts: list[dict] = []
|
||||||
|
for f in files:
|
||||||
|
data = await f.read()
|
||||||
|
filename = f.filename or 'receipt'
|
||||||
|
try:
|
||||||
|
parsed = parse_upload(filename, data)
|
||||||
|
receipts.extend(parsed)
|
||||||
|
logger.info('upload: parsed %s → %d receipt(s)', filename, len(parsed))
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning('upload: parse failed for %s: %s', filename, exc)
|
||||||
|
|
||||||
|
if not receipts:
|
||||||
|
logger.warning('upload: no parseable receipts found in upload from user_id=%s', user_id)
|
||||||
|
|
||||||
|
directive_id = session_id or uuid.uuid4().hex
|
||||||
|
extra_context = {'receipts': receipts, 'user_id': user_id}
|
||||||
|
|
||||||
|
settings = get_settings()
|
||||||
|
timeout = settings.directive_timeout_minutes * 60
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = await asyncio.wait_for(
|
||||||
|
master.handle_message(
|
||||||
|
user_id=user_id,
|
||||||
|
channel_id=None,
|
||||||
|
message=message,
|
||||||
|
directive_id=directive_id,
|
||||||
|
extra_context=extra_context,
|
||||||
|
),
|
||||||
|
timeout=timeout,
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=status.HTTP_504_GATEWAY_TIMEOUT,
|
||||||
|
detail=f'Directive timed out after {settings.directive_timeout_minutes}m',
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.exception('upload error user=%s: %s', user_id, exc)
|
||||||
|
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(exc))
|
||||||
|
|
||||||
|
return DispatchResponse(
|
||||||
|
directive_id=response.directive_id,
|
||||||
|
reply=response.response,
|
||||||
|
escalations=response.escalations,
|
||||||
|
actions_taken=response.actions_taken,
|
||||||
|
session_id=session_id,
|
||||||
|
)
|
||||||
@@ -84,3 +84,64 @@ class ExpensesTools:
|
|||||||
async def post_chatter_note(self, model: str, record_id: int, note: str) -> bool:
|
async def post_chatter_note(self, model: str, record_id: int, note: str) -> bool:
|
||||||
await self._o.call(model, 'message_post', [[record_id]], {'body': note, 'message_type': 'comment'})
|
await self._o.call(model, 'message_post', [[record_id]], {'body': note, 'message_type': 'comment'})
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
async def get_employee_id_for_user(self, user_id) -> int | None:
|
||||||
|
if not user_id:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
records = await self._o.search_read(
|
||||||
|
'hr.employee', [('user_id', '=', int(user_id))], ['id', 'name'], limit=1)
|
||||||
|
return records[0]['id'] if records else None
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning('get_employee_id_for_user failed user_id=%s: %s', user_id, exc)
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def get_default_expense_product(self) -> int | None:
|
||||||
|
try:
|
||||||
|
records = await self._o.search_read(
|
||||||
|
'product.product',
|
||||||
|
[('can_be_expensed', '=', True), ('type', '=', 'service')],
|
||||||
|
['id', 'name'], limit=1)
|
||||||
|
return records[0]['id'] if records else None
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning('get_default_expense_product failed: %s', exc)
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def create_expense_sheet(self, name: str, employee_id: int):
|
||||||
|
return await self._o.create('hr.expense.sheet', {
|
||||||
|
'name': name,
|
||||||
|
'employee_id': employee_id,
|
||||||
|
})
|
||||||
|
|
||||||
|
async def create_expense(self, sheet_id: int, employee_id: int, name: str,
|
||||||
|
total_amount: float, date: str, product_id: int = None,
|
||||||
|
description: str = ''):
|
||||||
|
vals: dict = {
|
||||||
|
'name': name,
|
||||||
|
'employee_id': employee_id,
|
||||||
|
'sheet_id': sheet_id,
|
||||||
|
'total_amount': total_amount,
|
||||||
|
'quantity': 1.0,
|
||||||
|
}
|
||||||
|
if date:
|
||||||
|
vals['date'] = date
|
||||||
|
if product_id:
|
||||||
|
vals['product_id'] = product_id
|
||||||
|
if description:
|
||||||
|
vals['description'] = description
|
||||||
|
return await self._o.create('hr.expense', vals)
|
||||||
|
|
||||||
|
async def attach_receipt(self, model: str, record_id: int, filename: str,
|
||||||
|
file_b64: str, mimetype: str) -> bool:
|
||||||
|
try:
|
||||||
|
await self._o.create('ir.attachment', {
|
||||||
|
'name': filename,
|
||||||
|
'datas': file_b64,
|
||||||
|
'res_model': model,
|
||||||
|
'res_id': record_id,
|
||||||
|
'mimetype': mimetype,
|
||||||
|
})
|
||||||
|
return True
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning('attach_receipt failed %s/%s: %s', model, record_id, exc)
|
||||||
|
return False
|
||||||
|
|||||||
133
agent_service/tools/receipt_parser.py
Normal file
133
agent_service/tools/receipt_parser.py
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
import zipfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_MIME = {
|
||||||
|
'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
|
||||||
|
'.png': 'image/png', '.gif': 'image/gif',
|
||||||
|
'.bmp': 'image/bmp', '.tiff': 'image/tiff', '.tif': 'image/tiff',
|
||||||
|
'.webp': 'image/webp', '.pdf': 'application/pdf',
|
||||||
|
'.html': 'text/html', '.htm': 'text/html',
|
||||||
|
'.txt': 'text/plain', '.zip': 'application/zip',
|
||||||
|
}
|
||||||
|
|
||||||
|
_IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif', '.webp'}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_upload(filename: str, data: bytes) -> list[dict]:
|
||||||
|
"""
|
||||||
|
Parse one uploaded file into a list of receipt dicts.
|
||||||
|
ZIP files are recursively unpacked; all other types return a single entry.
|
||||||
|
Each dict: {filename, text, b64, mimetype}
|
||||||
|
"""
|
||||||
|
ext = Path(filename).suffix.lower()
|
||||||
|
if ext == '.zip':
|
||||||
|
return _extract_zip(filename, data)
|
||||||
|
|
||||||
|
b64 = base64.b64encode(data).decode()
|
||||||
|
mimetype = _MIME.get(ext, 'application/octet-stream')
|
||||||
|
|
||||||
|
if ext in _IMAGE_EXTS:
|
||||||
|
text = _ocr_image(data, filename)
|
||||||
|
elif ext == '.pdf':
|
||||||
|
text = _extract_pdf(data, filename)
|
||||||
|
elif ext in ('.html', '.htm'):
|
||||||
|
text = _extract_html(data, filename)
|
||||||
|
elif ext == '.txt':
|
||||||
|
text = data.decode('utf-8', errors='replace')
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
text = data.decode('utf-8', errors='replace')
|
||||||
|
except Exception:
|
||||||
|
text = f'[Binary file: {filename}]'
|
||||||
|
|
||||||
|
return [{'filename': filename, 'text': text, 'b64': b64, 'mimetype': mimetype}]
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_zip(zip_filename: str, data: bytes) -> list[dict]:
|
||||||
|
results = []
|
||||||
|
try:
|
||||||
|
with zipfile.ZipFile(io.BytesIO(data)) as zf:
|
||||||
|
for member in zf.namelist():
|
||||||
|
if member.endswith('/'):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
member_data = zf.read(member)
|
||||||
|
results.extend(parse_upload(Path(member).name, member_data))
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning('receipt_parser: zip member %s failed: %s', member, exc)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.error('receipt_parser: zip %s failed: %s', zip_filename, exc)
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def _ocr_image(data: bytes, filename: str) -> str:
|
||||||
|
try:
|
||||||
|
from PIL import Image
|
||||||
|
import pytesseract
|
||||||
|
img = Image.open(io.BytesIO(data))
|
||||||
|
return pytesseract.image_to_string(img).strip()
|
||||||
|
except ImportError:
|
||||||
|
logger.warning('pytesseract/Pillow not installed — OCR unavailable for %s', filename)
|
||||||
|
return f'[Image: {filename} — install pytesseract+Pillow for OCR]'
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning('OCR failed for %s: %s', filename, exc)
|
||||||
|
return f'[Image: {filename} — OCR failed: {exc}]'
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_pdf(data: bytes, filename: str) -> str:
|
||||||
|
try:
|
||||||
|
import pdfplumber
|
||||||
|
parts = []
|
||||||
|
with pdfplumber.open(io.BytesIO(data)) as pdf:
|
||||||
|
for page in pdf.pages:
|
||||||
|
t = page.extract_text()
|
||||||
|
if t:
|
||||||
|
parts.append(t)
|
||||||
|
return '\n'.join(parts).strip()
|
||||||
|
except ImportError:
|
||||||
|
logger.warning('pdfplumber not installed — PDF extraction unavailable for %s', filename)
|
||||||
|
return f'[PDF: {filename} — install pdfplumber for text extraction]'
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning('PDF extraction failed for %s: %s', filename, exc)
|
||||||
|
return f'[PDF: {filename} — extraction failed: {exc}]'
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_html(data: bytes, filename: str) -> str:
|
||||||
|
try:
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
|
class _TextExtractor(HTMLParser):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self._parts: list[str] = []
|
||||||
|
self._skip = False
|
||||||
|
|
||||||
|
def handle_starttag(self, tag, attrs):
|
||||||
|
if tag in ('script', 'style'):
|
||||||
|
self._skip = True
|
||||||
|
|
||||||
|
def handle_endtag(self, tag):
|
||||||
|
if tag in ('script', 'style'):
|
||||||
|
self._skip = False
|
||||||
|
|
||||||
|
def handle_data(self, data):
|
||||||
|
if not self._skip:
|
||||||
|
s = data.strip()
|
||||||
|
if s:
|
||||||
|
self._parts.append(s)
|
||||||
|
|
||||||
|
def text(self):
|
||||||
|
return ' '.join(self._parts)
|
||||||
|
|
||||||
|
parser = _TextExtractor()
|
||||||
|
parser.feed(data.decode('utf-8', errors='replace'))
|
||||||
|
return parser.text()
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning('HTML extraction failed for %s: %s', filename, exc)
|
||||||
|
return f'[HTML: {filename} — extraction failed: {exc}]'
|
||||||
@@ -11,3 +11,8 @@ json-log-formatter==0.5.2
|
|||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
mcp==1.3.0
|
mcp==1.3.0
|
||||||
ollama==0.3.3
|
ollama==0.3.3
|
||||||
|
# Receipt parsing — also requires: apt install tesseract-ocr (for image OCR)
|
||||||
|
pdfplumber==0.11.4
|
||||||
|
Pillow==10.4.0
|
||||||
|
pytesseract==0.3.13
|
||||||
|
python-multipart==0.0.12
|
||||||
|
|||||||
Reference in New Issue
Block a user