# odoo-ai/agent_service/agents/expenses_agent.py

from __future__ import annotations
import asyncio
import difflib
import json
import logging
from datetime import date as _date
from .base_agent import BaseAgent, AgentReport, AgentDirective, SweepReport
from ..tools.expenses_tools import ExpensesTools

logger = logging.getLogger(__name__)

# Tool schema advertised by the expenses agent.  Every entry names one tool,
# describes it, and lists its parameters; a parameter is required unless it
# carries 'optional': True.  The names here are the keys accepted by
# ExpensesAgent._dispatch_tool.
EXPENSES_TOOLS = [
    {
        'name': 'get_expenses',
        'description': 'Retrieve expense records',
        'parameters': {
            'employee_id': {'type': 'integer', 'optional': True},
            'state': {'type': 'string', 'optional': True},
            'date_from': {'type': 'string', 'optional': True},
            'date_to': {'type': 'string', 'optional': True},
            'limit': {'type': 'integer', 'optional': True},
        },
    },
    {
        'name': 'get_expense_sheets',
        'description': 'Get expense report sheets',
        'parameters': {
            'state': {'type': 'string', 'optional': True},
            'employee_id': {'type': 'integer', 'optional': True},
            'limit': {'type': 'integer', 'optional': True},
        },
    },
    {
        'name': 'get_pending_approvals',
        'description': 'Get expense sheets pending approval',
        'parameters': {},
    },
    {
        'name': 'approve_expense_sheet',
        'description': 'Approve an expense sheet',
        'parameters': {
            'sheet_id': {'type': 'integer'},
        },
    },
    {
        'name': 'get_expenses_summary',
        'description': 'Get expense summary for a period',
        'parameters': {
            'date_from': {'type': 'string', 'optional': True},
            'date_to': {'type': 'string', 'optional': True},
        },
    },
    {
        'name': 'get_expense_by_employee',
        'description': 'Get expenses for a specific employee',
        'parameters': {
            'employee_id': {'type': 'integer'},
            'limit': {'type': 'integer', 'optional': True},
        },
    },
    {
        'name': 'flag_for_review',
        'description': 'Flag an expense for review',
        'parameters': {
            'model': {'type': 'string'},
            'record_id': {'type': 'integer'},
            'reason': {'type': 'string'},
            'severity': {'type': 'string', 'optional': True},
        },
    },
    {
        'name': 'post_chatter_note',
        'description': 'Post a note on a record',
        'parameters': {
            'model': {'type': 'string'},
            'record_id': {'type': 'integer'},
            'note': {'type': 'string'},
        },
    },
]


class ExpensesAgent(BaseAgent):
    name = 'expenses_agent'
    domain = 'expenses'
    required_odoo_module = 'hr_expense'
    system_prompt_file = 'expenses_system.txt'
    tools = EXPENSES_TOOLS

    def __init__(self, odoo, llm, peer_bus=None):
        super().__init__(odoo, llm, peer_bus)
        self._et = ExpensesTools(odoo)
        self._gathered_data: dict = {}
        self._actions_taken: list = []
        self._escalations_list: list = []

    async def _plan(self) -> dict:
        task = (self._directive.task if self._directive else '').lower()
        receipts = getattr(self._directive.context, 'receipts', []) if self._directive else []

        # The master LLM rewrites the user message into intent_summary (task).
        # Also check the original raw_message threaded through peer_data so
        # short replies like "skip duplicates" are detected even when rewritten.
        raw_msg = ''
        if self._directive and self._directive.context:
            raw_msg = (self._directive.context.peer_data.get('raw_message') or '').lower()
        combined = task + ' ' + raw_msg

        # Detect whether the user is responding to a duplicate-approval request
        skip_keywords = ('skip', 'remove duplicate', 'exclude duplicate', 'drop duplicate')
        keep_keywords = ('keep all', 'keep both', 'include all', 'no skip', "don't skip")
        confirm_keywords = ('confirm', 'looks good', 'go ahead', 'proceed', 'create it', 'create them')
        if any(k in combined for k in keep_keywords):
            user_dup_decision = 'keep_all'
        elif any(k in combined for k in skip_keywords):
            user_dup_decision = 'skip'
        else:
            user_dup_decision = 'skip'  # default: skip duplicates when confirmed

        user_confirmed = any(k in combined for k in confirm_keywords)

        return {
            'mode': 'create_from_receipts' if receipts else 'read',
            'user_dup_decision': user_dup_decision,
            'user_confirmed': user_confirmed,
            'fetch_summary': any(k in task for k in ('summary', 'overview')) and not receipts,
            'fetch_pending': any(k in task for k in ('pending', 'approve', 'approval')) and not receipts,
            'employee_id': self._directive.params.get('employee_id') if self._directive else None,
            'date_from': self._directive.params.get('date_from') if self._directive else None,
            'date_to': self._directive.params.get('date_to') if self._directive else None,
        }

    async def _gather(self, plan: dict) -> dict:
        data: dict = {'mode': plan.get('mode', 'read'),
                      'user_dup_decision': plan.get('user_dup_decision', 'skip'),
                      'user_confirmed': plan.get('user_confirmed', False)}
        if plan.get('mode') == 'create_from_receipts':
            self._gathered_data = data
            return data
        data['summary'] = await self._et.get_expenses_summary(
            date_from=plan.get('date_from'), date_to=plan.get('date_to'),
        )
        if plan.get('fetch_pending'):
            data['pending'] = await self._et.get_pending_approvals()
        self._gathered_data = data
        return data

    async def _reason(self) -> dict:
        data = self._gathered_data
        analysis: dict = {'escalations': [], 'flags': []}
        if data.get('mode') == 'create_from_receipts':
            self._escalations_list = []
            return analysis
        summary = data.get('summary', {})
        if summary.get('pending_approval_count', 0) > 10:
            analysis['escalations'].append(
                f'{summary["pending_approval_count"]} expense sheets pending approval.'
            )
        self._escalations_list = analysis['escalations']
        return analysis

    async def _act(self, reasoning: dict) -> list:
        if self._gathered_data.get('mode') != 'create_from_receipts':
            return []
        receipts = getattr(self._directive.context, 'receipts', []) if self._directive else []
        if not receipts:
            return []

        user_dup_decision = self._gathered_data.get('user_dup_decision', 'skip')
        user_confirmed = self._gathered_data.get('user_confirmed', False)

        user_id = (self._directive.context.peer_data.get('requesting_user_id')
                   if self._directive else None)
        employee_id = await self._et.get_employee_id_for_user(user_id)
        if not employee_id:
            self._escalations_list.append(
                'No employee record found for the current user; cannot create expense report.')
            return []

        expense_products = await self._et.get_expense_products()
        default_product_id = expense_products[0]['id'] if expense_products else None
        product_map = {p['id']: p['name'] for p in expense_products}

        # Pass 1: byte-exact dedup
        seen_hashes: set = set()
        unique_receipts = []
        for r in receipts:
            h = r.get('sha256')
            if h and h in seen_hashes:
                logger.info('expenses_agent: skipping byte-identical receipt %s', r.get('filename'))
                continue
            if h:
                seen_hashes.add(h)
            unique_receipts.append(r)

        # Parse all receipts concurrently
        parse_tasks = [
            self._parse_receipt_text(
                r.get('text', ''), r.get('filename', 'receipt'),
                expense_products=expense_products,
                date_hint=r.get('date_from_name'),
            )
            for r in unique_receipts
        ]
        raw_parsed = await asyncio.gather(*parse_tasks, return_exceptions=True)

        paired: list[tuple[dict, dict]] = []
        for receipt, parsed in zip(unique_receipts, raw_parsed):
            if isinstance(parsed, Exception):
                logger.warning('expenses_agent: parse failed for %s: %s',
                               receipt.get('filename'), parsed)
                parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
                          'date': receipt.get('date_from_name') or _date.today().isoformat(),
                          'time': None, 'product_name': ''}
            paired.append((receipt, parsed))

        # Pass 2: semantic dedup
        deduped: list[tuple[dict, dict]] = []
        dup_indices: set[int] = set()  # indices into `paired` that are duplicates
        for i, (receipt, parsed) in enumerate(paired):
            dup_idx = self._find_semantic_duplicate(parsed, deduped)
            if dup_idx is not None:
                dup_indices.add(i)
                if len(receipt.get('text', '')) > len(deduped[dup_idx][0].get('text', '')):
                    deduped[dup_idx] = (receipt, parsed)
            else:
                deduped.append((receipt, parsed))

        # Auto-skip semantic duplicates by default; keep_all only if user explicitly asked.
        # Receipts are only available in this single /upload request — there is no
        # persistent receipt store across turns, so a "confirm then create" flow would
        # always fail on the follow-up turn (no receipts in context).  Creating
        # immediately in draft state is the correct approach: users review and
        # submit inside Odoo > Expenses.
        n_skipped = len(paired) - len(deduped)
        self._gathered_data['n_skipped'] = n_skipped
        final_list = paired if user_dup_decision == 'keep_all' else deduped

        sheet_name = f'Expense Report - {_date.today().isoformat()}'
        sheet_result = await self._et.create_expense_sheet(sheet_name, employee_id)
        if not sheet_result.success:
            self._escalations_list.append(f'Failed to create expense sheet: {sheet_result.error}')
            return []

        sheet_id = sheet_result.record_id
        actions = [f'Created expense sheet "{sheet_name}" (ID {sheet_id})']

        for receipt, parsed in final_list:
            product_id = default_product_id
            chosen_name = parsed.get('product_name', '')
            if chosen_name:
                for p in expense_products:
                    if p['name'].lower() == chosen_name.lower():
                        product_id = p['id']
                        break

            expense_result = await self._et.create_expense(
                sheet_id=sheet_id,
                employee_id=employee_id,
                name=str(parsed.get('vendor', receipt.get('filename', 'Expense')))[:64],
                total_amount=float(parsed.get('amount', 0.0)),
                date=str(parsed.get('date') or _date.today().isoformat()),
                product_id=product_id,
            )
            if expense_result.success:
                cat = product_map.get(product_id, 'Expense')
                actions.append(
                    f"Added: {parsed.get('vendor', 'Unknown vendor')} "
                    f"${float(parsed.get('amount', 0)):.2f} "
                    f"({cat}) on {parsed.get('date', 'today')}"
                )
                if receipt.get('b64'):
                    await self._et.attach_receipt(
                        'hr.expense', expense_result.record_id,
                        receipt.get('filename', 'receipt'),
                        receipt['b64'],
                        receipt.get('mimetype', 'application/octet-stream'),
                    )
            else:
                actions.append(
                    f"Could not create expense for {receipt.get('filename', 'receipt')}: "
                    f"{expense_result.error}"
                )

        self._actions_taken = actions
        return actions

    @staticmethod
    def _find_semantic_duplicate(parsed: dict, candidates: list) -> int | None:
        """
        Return the index in `candidates` of a receipt that appears to be the
        same physical receipt as `parsed`, or None if no match found.

        Pass 1 — exact-amount match (all must pass):
          1. Same date
          2. Amount > 0 and within $0.05 of each other
          3. Transaction times within 30 min (if both present)
          4. Vendor similarity >= 60 %  (or both vendors are raw filenames)

        Pass 2 — OCR-error match (amount may differ due to misread):
          1. Same date
          2. Both amounts > 0
          3. Vendor similarity >= 80 % (stricter threshold compensates for loose amount)
          4. Times within 30 min (if both present)
        """
        amt = float(parsed.get('amount', 0))
        date = parsed.get('date', '')
        time = parsed.get('time')  # HH:MM or None
        vendor = str(parsed.get('vendor', '')).lower().strip()
        is_filename = vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))

        def _times_compatible(t1, t2) -> bool:
            """Return False only when both times are present and >30 min apart."""
            if not (t1 and t2):
                return True
            try:
                h1, m1 = (int(p) for p in t1.split(':')[:2])
                h2, m2 = (int(p) for p in t2.split(':')[:2])
                return abs((h1 * 60 + m1) - (h2 * 60 + m2)) <= 30
            except Exception:
                return True

        # Pass 1: amount must match within $0.05
        for idx, (_, other) in enumerate(candidates):
            other_amt = float(other.get('amount', 0))
            if amt == 0 or other_amt == 0:
                continue
            if abs(amt - other_amt) > 0.05:
                continue
            if date != other.get('date', ''):
                continue
            if not _times_compatible(time, other.get('time')):
                continue
            other_vendor = str(other.get('vendor', '')).lower().strip()
            other_is_filename = other_vendor.endswith(
                ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
            if is_filename or other_is_filename:
                return idx
            if difflib.SequenceMatcher(None, vendor, other_vendor).ratio() >= 0.6:
                return idx

        # Pass 2: same vendor + same date even when amounts differ (OCR misread)
        if not is_filename:
            for idx, (_, other) in enumerate(candidates):
                other_amt = float(other.get('amount', 0))
                if amt == 0 or other_amt == 0:
                    continue
                if date != other.get('date', ''):
                    continue
                if not _times_compatible(time, other.get('time')):
                    continue
                other_vendor = str(other.get('vendor', '')).lower().strip()
                if other_vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')):
                    continue
                if difflib.SequenceMatcher(None, vendor, other_vendor).ratio() >= 0.80:
                    return idx

        return None

    @staticmethod
    def _match_category(category: str, expense_products: list) -> str:
        """Map a vision-model category label to the nearest expense product name.

        Tries exact/substring match first, then a fuzzy SequenceMatcher pass.
        Returns empty string when no reasonable match is found.
        """
        if not expense_products or not category:
            return ''
        cat = category.lower().strip()
        # Exact or substring match
        for p in expense_products:
            name = p['name'].lower()
            if cat == name or cat in name or name in cat:
                return p['name']
        # Fuzzy fallback (ratio >= 0.4)
        names_lower = [p['name'].lower() for p in expense_products]
        matches = difflib.get_close_matches(cat, names_lower, n=1, cutoff=0.4)
        if matches:
            for p in expense_products:
                if p['name'].lower() == matches[0]:
                    return p['name']
        return ''

    async def _parse_receipt_text(self, text: str, filename: str,
                                   expense_products: list = None,
                                   date_hint: str = None) -> dict:
        today = _date.today().isoformat()
        fallback = {'vendor': filename, 'amount': 0.0,
                    'date': date_hint or today, 'time': None, 'product_name': ''}

        # ── Fast path: vision model already returned structured JSON ──────────
        # receipt_parser._ocr_image_vision() returns a JSON string directly
        # when a vision model is configured.  Skip the second LLM call entirely.
        stripped = (text or '').strip()
        if stripped.startswith('{'):
            try:
                data = json.loads(stripped)
                if 'amount' in data:
                    logger.debug('expenses_agent: using vision pre-extracted JSON for %s', filename)
                    # Map the vision category label → expense product name
                    product_name = self._match_category(
                        data.get('category', ''), expense_products or [])
                    # Vision model sometimes returns the string "null" instead
                    # of JSON null — normalise both fields.
                    _NULL = (None, 'null', 'None', '')
                    raw_time = data.get('time')
                    time_val = None if raw_time in _NULL else str(raw_time)
                    raw_date = data.get('date')
                    date_val = None if raw_date in _NULL else str(raw_date)
                    return {
                        'vendor': str(data.get('vendor') or filename),
                        'amount': float(data.get('amount', 0.0)),
                        'date': date_val or date_hint or today,
                        'time': time_val,
                        'product_name': product_name,
                    }
            except (json.JSONDecodeError, ValueError, TypeError):
                pass  # not clean JSON — fall through to LLM path

        ocr_failed = not stripped or stripped.startswith('[')

        product_list = ''
        if expense_products:
            names = [p['name'] for p in expense_products]
            product_list = ', '.join(f'"{n}"' for n in names)

        if ocr_failed:
            # No OCR text — still try to classify category from filename/date
            if not product_list:
                return fallback
            prompt = (
                f'A receipt photo named "{filename}" could not be read by OCR. '
                f'Based only on the filename, pick the most likely expense category '
                f'from this list: [{product_list}]. '
                f'Return ONLY valid JSON: {{"product_name": "..."}}'
            )
        else:
            # Keep both the header (vendor/date) and footer (totals) of the receipt.
            # A plain [:N] cut discards the bottom of long receipts where the grand
            # total lives — the primary cause of amount=0 extraction errors.
            if len(stripped) > 3000:
                receipt_text = stripped[:1500] + '\n[...]\n' + stripped[-1500:]
            else:
                receipt_text = stripped

            # When the filename carries a reliable timestamp, inject it directly
            # so the LLM doesn't try to read (and potentially misread) the date
            # from garbled OCR text.
            if date_hint:
                date_instruction = (
                    f'Use exactly "{date_hint}" — this date was read from the file '
                    f'timestamp and is more reliable than the OCR text.'
                )
            else:
                date_instruction = (
                    f'Extract from the receipt text in YYYY-MM-DD format; '
                    f'use {today} only if no date is visible.'
                )

            prompt = (
                'You are a receipt data extractor. '
                'Copy values EXACTLY as they appear in the text — '
                'do NOT guess, infer, "correct" OCR errors, or invent plausible values.\n\n'
                'Return ONLY valid JSON with these keys:\n'
                f'"vendor": merchant name exactly as printed; '
                f'empty string "" if you cannot find it clearly,\n'
                f'"amount": the FINAL total — find a line labeled "Total", "Grand Total", '
                f'"Amount Due", or "Balance Due"; copy the number exactly as written; '
                f'never use subtotal, tax, or tip lines; '
                f'return 0 if no clearly labeled final total is present,\n'
                f'"date": {date_instruction}\n'
                f'"time": transaction time HH:MM (24-hour) exactly as printed, or null,\n'
                f'"product_name": best match from [{product_list}] or "".\n\n'
                f'IMPORTANT: This text came from OCR and may contain garbled characters. '
                f'If a value looks corrupted, return the safe default (0 / "" / null) '
                f'rather than substituting a "more logical" value.\n\n'
                f'Receipt text:\n{receipt_text}\n\nJSON only:'
            )
        try:
            resp = await self._llm.submit(
                [{'role': 'user', 'content': prompt}],
                caller='expenses_agent_receipt_parser',
            )
            raw = (resp.content or '').strip()
            first, last = raw.find('{'), raw.rfind('}')
            if first != -1 and last > first:
                data = json.loads(raw[first:last + 1])
                return {
                    'vendor': str(data.get('vendor', filename)),
                    'amount': float(data.get('amount', 0.0)),
                    'date': str(data.get('date') or date_hint or today),
                    'time': data.get('time') or None,
                    'product_name': str(data.get('product_name', '')),
                }
        except Exception as exc:
            logger.warning('Receipt parse failed for %s: %s', filename, exc)
        return fallback

    async def _report(self) -> AgentReport:
        data = self._gathered_data
        directive_id = self._directive.directive_id if self._directive else ''

        if data.get('mode') == 'create_from_receipts':
            if self._actions_taken:
                lines = '\n'.join(f'  • {a}' for a in self._actions_taken)
                n_skipped = data.get('n_skipped', 0)
                dup_note = f'\n({n_skipped} duplicate receipt(s) were automatically skipped.)' if n_skipped else ''
                summary = (
                    f'Expense report created successfully:\n{lines}{dup_note}\n\n'
                    'The report is in draft — open Odoo › Expenses, '
                    'review the amounts, and click Submit to send for approval.'
                )
                status = 'complete'
            else:
                summary = ('Could not create expense report. ' +
                           '; '.join(self._escalations_list or ['Unknown error']))
                status = 'failed'
            return AgentReport(
                directive_id=directive_id, agent=self.name, status=status,
                summary=summary, data=data,
                escalations=self._escalations_list, actions_taken=self._actions_taken)

        summary_data = data.get('summary', {})
        parts = []
        if summary_data:
            parts.append(
                f'Expenses: {summary_data.get("total_expenses", 0)} records, '
                f'total ${summary_data.get("total_amount", 0):.2f}. '
                f'{summary_data.get("pending_approval_count", 0)} pending approval.'
            )
        if not parts:
            parts.append('Expenses review complete.')
        return AgentReport(
            directive_id=directive_id, agent=self.name, status='complete',
            summary='\n'.join(parts), data=data,
            escalations=self._escalations_list, actions_taken=[])

    async def _dispatch_tool(self, name: str, args: dict):
        dispatch = {
            'get_expenses': self._et.get_expenses,
            'get_expense_sheets': self._et.get_expense_sheets,
            'get_pending_approvals': self._et.get_pending_approvals,
            'approve_expense_sheet': self._et.approve_expense_sheet,
            'get_expenses_summary': self._et.get_expenses_summary,
            'get_expense_by_employee': self._et.get_expense_by_employee,
            'flag_for_review': self._et.flag_for_review,
            'post_chatter_note': self._et.post_chatter_note,
        }
        if name not in dispatch:
            raise ValueError(f'Unknown tool: {name}')
        return await dispatch[name](**args)

    async def handle_peer_request(self, request_type: str, params: dict, directive_id: str) -> dict:
        try:
            if request_type == 'expenses_summary':
                return await self._et.get_expenses_summary()
            if request_type == 'employee_expenses':
                return {'expenses': await self._et.get_expense_by_employee(
                    employee_id=params['employee_id'])}
            return {'error': f'Unknown type: {request_type}'}
        except Exception as exc:
            return {'error': str(exc)}

    async def sweep(self) -> SweepReport:
        findings = []
        try:
            pending = await self._et.get_pending_approvals()
            for sheet in pending:
                emp = sheet.get('employee_id', [0, ''])
                findings.append({
                    'type': 'pending_expense_approval',
                    'sheet_id': sheet.get('id'),
                    'employee': emp[1] if isinstance(emp, list) else '',
                    'amount': sheet.get('total_amount', 0),
                    'severity': 'low',
                })
        except Exception as exc:
            return SweepReport(agent=self.name, findings=[], error=str(exc))
        return SweepReport(agent=self.name, findings=findings, actions_taken=[],
                           summary=f'Expenses sweep: {len(findings)} pending approvals.')