Add duplicate approval flow with time-based dedup

- expenses_agent: extract transaction time (HH:MM) from OCR receipt text
- expenses_agent: _find_semantic_duplicate uses time to rule out false positives (>30 min apart = different receipts)
- expenses_agent: pause when duplicates found, set mode=awaiting_dup_approval, ask user before creating sheet
- expenses_agent: _report formats approval message listing each dup pair with vendor/amount/date/times/filenames
- ab_ai_mail: _find_pending_attachments recognises dup-approval bot message so ZIP re-attaches on user reply

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 02:07:37 -04:00
parent f90a2ee863
commit 462f63d11d
2 changed files with 105 additions and 35 deletions
--- a/addons/activeblue_ai/models/ab_ai_mail.py
+++ b/addons/activeblue_ai/models/ab_ai_mail.py
@@ -191,12 +191,18 @@ class DiscussChannel(models.Model):
        (i.e. the bot hasn't already acted on those files).
        """
        messages = self.message_ids.sorted('date', reverse=True)[:_LOOKBACK_MESSAGES]
+        _bot_question_phrases = (
+            'what would you like me to do',
+            'suspected duplicate',
+            'skip duplicates',
+            'keep all',
+        )
        prev_was_bot_question = False
        for msg in messages:
            is_bot = msg.author_id == bot_partner
            if is_bot:
-                # Check whether this bot message was a clarification question
-                if 'what would you like me to do' in (msg.body or '').lower():
+                body_lower = (msg.body or '').lower()
+                if any(p in body_lower for p in _bot_question_phrases):
                    prev_was_bot_question = True
                continue
            # Human message
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -57,8 +57,20 @@ class ExpensesAgent(BaseAgent):
    async def _plan(self) -> dict:
        task = (self._directive.task if self._directive else '').lower()
        receipts = getattr(self._directive.context, 'receipts', []) if self._directive else []
+
+        # Detect whether the user is responding to a duplicate-approval request
+        skip_keywords = ('skip', 'yes', 'remove duplicate', 'exclude duplicate', 'drop duplicate')
+        keep_keywords = ('keep all', 'keep both', 'include all', 'no skip', "don't skip")
+        if any(k in task for k in skip_keywords):
+            user_dup_decision = 'skip'
+        elif any(k in task for k in keep_keywords):
+            user_dup_decision = 'keep_all'
+        else:
+            user_dup_decision = 'none'  # first time through — will ask if dups found
+
        return {
            'mode': 'create_from_receipts' if receipts else 'read',
+            'user_dup_decision': user_dup_decision,
            'fetch_summary': any(k in task for k in ('summary', 'overview')) and not receipts,
            'fetch_pending': any(k in task for k in ('pending', 'approve', 'approval')) and not receipts,
            'employee_id': self._directive.params.get('employee_id') if self._directive else None,
@@ -67,7 +79,8 @@ class ExpensesAgent(BaseAgent):
        }

    async def _gather(self, plan: dict) -> dict:
-        data: dict = {'mode': plan.get('mode', 'read')}
+        data: dict = {'mode': plan.get('mode', 'read'),
+                      'user_dup_decision': plan.get('user_dup_decision', 'none')}
        if plan.get('mode') == 'create_from_receipts':
            self._gathered_data = data
            return data
@@ -100,6 +113,8 @@ class ExpensesAgent(BaseAgent):
        if not receipts:
            return []

+        user_dup_decision = self._gathered_data.get('user_dup_decision', 'none')
+
        user_id = (self._directive.context.peer_data.get('requesting_user_id')
                   if self._directive else None)
        employee_id = await self._et.get_employee_id_for_user(user_id)
@@ -108,30 +123,20 @@ class ExpensesAgent(BaseAgent):
                'No employee record found for the current user; cannot create expense report.')
            return []

-        sheet_name = f'Expense Report - {_date.today().isoformat()}'
-        sheet_result = await self._et.create_expense_sheet(sheet_name, employee_id)
-        if not sheet_result.success:
-            self._escalations_list.append(f'Failed to create expense sheet: {sheet_result.error}')
-            return []
-
-        sheet_id = sheet_result.record_id
-        actions = [f'Created expense sheet "{sheet_name}" (ID {sheet_id})']
-
        # Fetch all expensable products once for category selection
        expense_products = await self._et.get_expense_products()
        default_product_id = expense_products[0]['id'] if expense_products else None
        product_map = {p['id']: p['name'] for p in expense_products}

-        # Deduplicate receipts by SHA256 hash — same image uploaded twice
+        # Pass 1: byte-exact dedup (same file uploaded twice)
        seen_hashes: set = set()
        unique_receipts = []
        for r in receipts:
            h = r.get('sha256')
+            if h and h in seen_hashes:
+                logger.info('expenses_agent: skipping byte-identical receipt %s', r.get('filename'))
+                continue
            if h:
-                if h in seen_hashes:
-                    logger.info('expenses_agent: skipping duplicate receipt %s', r.get('filename'))
-                    actions.append(f"Skipped duplicate: {r.get('filename', 'receipt')}")
-                    continue
                seen_hashes.add(h)
            unique_receipts.append(r)

@@ -154,30 +159,46 @@ class ExpensesAgent(BaseAgent):
                               receipt.get('filename'), parsed)
                parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
                          'date': receipt.get('date_from_name') or _date.today().isoformat(),
-                          'product_name': ''}
+                          'time': None, 'product_name': ''}
            paired.append((receipt, parsed))

-        # Semantic dedup — different photos of the same physical receipt share
-        # the same amount, date, and a similar vendor name.
+        # Pass 2: semantic dedup — detect multiple photos of the same receipt
        deduped: list[tuple[dict, dict]] = []
+        dup_pairs: list[tuple[int, dict, dict]] = []  # (kept_idx, dup_receipt, dup_parsed)
        for receipt, parsed in paired:
            dup_idx = self._find_semantic_duplicate(parsed, deduped)
            if dup_idx is not None:
-                # Keep whichever photo produced more OCR text (clearer shot)
-                existing_receipt, _ = deduped[dup_idx]
-                if len(receipt.get('text', '')) > len(existing_receipt.get('text', '')):
+                dup_pairs.append((dup_idx, receipt, parsed))
+                # Tentatively keep whichever photo had more OCR text
+                if len(receipt.get('text', '')) > len(deduped[dup_idx][0].get('text', '')):
                    deduped[dup_idx] = (receipt, parsed)
-                actions.append(
-                    f"Skipped duplicate photo of "
-                    f"{parsed.get('vendor', receipt.get('filename', 'receipt'))}"
-                    f" ${float(parsed.get('amount', 0)):.2f}"
-                )
-                logger.info('expenses_agent: semantic duplicate %s skipped',
-                            receipt.get('filename'))
            else:
                deduped.append((receipt, parsed))

-        for receipt, parsed in deduped:
+        # If duplicates were found and user hasn't decided yet, pause and ask
+        if dup_pairs and user_dup_decision == 'none':
+            self._gathered_data['mode'] = 'awaiting_dup_approval'
+            self._pending_dup_pairs = dup_pairs
+            self._deduped = deduped
+            return []
+
+        # Apply user's decision
+        if user_dup_decision == 'keep_all':
+            final_list = paired
+        else:
+            final_list = deduped  # default: skip semantic duplicates
+
+        # Create the sheet now that we know what to include
+        sheet_name = f'Expense Report - {_date.today().isoformat()}'
+        sheet_result = await self._et.create_expense_sheet(sheet_name, employee_id)
+        if not sheet_result.success:
+            self._escalations_list.append(f'Failed to create expense sheet: {sheet_result.error}')
+            return []
+
+        sheet_id = sheet_result.record_id
+        actions = [f'Created expense sheet "{sheet_name}" (ID {sheet_id})']
+
+        for receipt, parsed in final_list:

            # Pick product by name match returned from LLM, fall back to default
            product_id = default_product_id
@@ -228,12 +249,14 @@ class ExpensesAgent(BaseAgent):
        Match criteria (all must pass):
          1. Same date
          2. Amount > 0 and within $0.05 of each other
-          3. Vendor name similarity >= 60 %  (or both vendors are raw filenames)
+          3. Transaction times within 30 min of each other (if both present);
+             times > 30 min apart rule out a duplicate
+          4. Vendor name similarity >= 60 %  (or both vendors are raw filenames)
        """
        amt = float(parsed.get('amount', 0))
        date = parsed.get('date', '')
+        time = parsed.get('time')  # HH:MM or None
        vendor = str(parsed.get('vendor', '')).lower().strip()
-        # If OCR failed the vendor is just a filename — can't dedup by content
        is_filename = vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))

        for idx, (_, other) in enumerate(candidates):
@@ -245,6 +268,17 @@ class ExpensesAgent(BaseAgent):
                continue
            if date != other.get('date', ''):
                continue
+            # Time check: if both receipts have a transaction time and they are
+            # more than 30 minutes apart they are different transactions.
+            other_time = other.get('time')
+            if time and other_time:
+                try:
+                    h1, m1 = (int(p) for p in time.split(':')[:2])
+                    h2, m2 = (int(p) for p in other_time.split(':')[:2])
+                    if abs((h1 * 60 + m1) - (h2 * 60 + m2)) > 30:
+                        continue
+                except Exception:
+                    pass  # unparseable time — ignore the signal
            other_vendor = str(other.get('vendor', '')).lower().strip()
            other_is_filename = other_vendor.endswith(
                ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
@@ -261,7 +295,7 @@ class ExpensesAgent(BaseAgent):
                                   date_hint: str = None) -> dict:
        today = _date.today().isoformat()
        fallback = {'vendor': filename, 'amount': 0.0,
-                    'date': date_hint or today, 'product_name': ''}
+                    'date': date_hint or today, 'time': None, 'product_name': ''}
        ocr_failed = not text or text.startswith('[')

        product_list = ''
@@ -289,6 +323,8 @@ class ExpensesAgent(BaseAgent):
                'do NOT use subtotal, tax, or tip separately; '
                'if multiple totals appear pick the largest one labeled as the final total),\n'
                f'"date" (string YYYY-MM-DD, use {date_hint or today} if not found in text),\n'
+                '"time" (string HH:MM in 24-hour format — the transaction time printed on the receipt; '
+                'null if not present),\n'
                f'"product_name" (string, pick the best match from [{product_list}] or empty string).\n\n'
                f'Receipt text:\n{text[:2000]}\n\nJSON only:'
            )
@@ -305,6 +341,7 @@ class ExpensesAgent(BaseAgent):
                    'vendor': str(data.get('vendor', filename)),
                    'amount': float(data.get('amount', 0.0)),
                    'date': str(data.get('date') or date_hint or today),
+                    'time': data.get('time') or None,
                    'product_name': str(data.get('product_name', '')),
                }
        except Exception as exc:
@@ -315,12 +352,39 @@ class ExpensesAgent(BaseAgent):
        data = self._gathered_data
        directive_id = self._directive.directive_id if self._directive else ''

+        if data.get('mode') == 'awaiting_dup_approval':
+            dup_pairs = getattr(self, '_pending_dup_pairs', [])
+            deduped = getattr(self, '_deduped', [])
+            lines = [f'I found {len(dup_pairs)} suspected duplicate receipt photo(s). '
+                     f'Please review before I create the expense report:\n']
+            for kept_idx, dup_receipt, dup_parsed in dup_pairs:
+                kept_receipt, kept_parsed = deduped[kept_idx]
+                vendor = (dup_parsed.get('vendor') or kept_parsed.get('vendor', 'Unknown'))
+                amount = float(dup_parsed.get('amount', 0))
+                dt = dup_parsed.get('date', '')
+                time_a = kept_parsed.get('time') or ''
+                time_b = dup_parsed.get('time') or ''
+                line = f'• {vendor}  ${amount:.2f}  on {dt}'
+                if time_a or time_b:
+                    line += f'  (Photo A at {time_a or "?"}, Photo B at {time_b or "?"})'
+                line += (f'\n  Photo A: {kept_receipt.get("filename", "?")}'
+                         f'\n  Photo B: {dup_receipt.get("filename", "?")}')
+                lines.append(line)
+            lines.append(
+                '\nReply "skip duplicates" to keep the clearest photo of each, '
+                'or "keep all" to include every photo as a separate expense.'
+            )
+            return AgentReport(
+                directive_id=directive_id, agent=self.name, status='complete',
+                summary='\n'.join(lines), data=data,
+                escalations=[], actions_taken=[])
+
        if data.get('mode') == 'create_from_receipts':
            if self._actions_taken:
                lines = '\n'.join(f'  • {a}' for a in self._actions_taken)
                summary = (
                    f'Expense report created successfully:\n{lines}\n\n'
-                    'The report is in draft. Please open Odoo > Expenses, '
+                    'The report is in draft. Please open Odoo › Expenses, '
                    'review the entries, and click Submit to send for approval.'
                )
                status = 'complete'