From 462f63d11d64a7af25563953c2310e397e44489a Mon Sep 17 00:00:00 2001
From: Carlos Garcia <tocmo@DESKTOP-O6U0UOS.AVC.local>
Date: Sat, 16 May 2026 02:07:37 -0400
Subject: [PATCH] Add duplicate approval flow with time-based dedup

- expenses_agent: extract transaction time (HH:MM) from OCR receipt text
- expenses_agent: _find_semantic_duplicate uses transaction time to rule out false positives (when both receipts have a time and they are >30 min apart, they are treated as different receipts)
- expenses_agent: pause when semantic duplicates are found and the user has not yet decided; set mode=awaiting_dup_approval and ask the user before creating the sheet
- expenses_agent: _report formats approval message listing each dup pair with vendor/amount/date/times/filenames
- ab_ai_mail: _find_pending_attachments recognises dup-approval bot message so ZIP re-attaches on user reply

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 addons/activeblue_ai/models/ab_ai_mail.py |  10 +-
 agent_service/agents/expenses_agent.py    | 130 ++++++++++++++++------
 2 files changed, 105 insertions(+), 35 deletions(-)

diff --git a/addons/activeblue_ai/models/ab_ai_mail.py b/addons/activeblue_ai/models/ab_ai_mail.py
index acee401..382a0c9 100644
--- a/addons/activeblue_ai/models/ab_ai_mail.py
+++ b/addons/activeblue_ai/models/ab_ai_mail.py
@@ -191,12 +191,18 @@ class DiscussChannel(models.Model):
         (i.e. the bot hasn't already acted on those files).
         """
         messages = self.message_ids.sorted('date', reverse=True)[:_LOOKBACK_MESSAGES]
+        _bot_question_phrases = (
+            'what would you like me to do',
+            'suspected duplicate',
+            'skip duplicates',
+            'keep all',
+        )
         prev_was_bot_question = False
         for msg in messages:
             is_bot = msg.author_id == bot_partner
             if is_bot:
-                # Check whether this bot message was a clarification question
-                if 'what would you like me to do' in (msg.body or '').lower():
+                body_lower = (msg.body or '').lower()
+                if any(p in body_lower for p in _bot_question_phrases):
                     prev_was_bot_question = True
                 continue
             # Human message
diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py
index a3efeff..0427ed4 100644
--- a/agent_service/agents/expenses_agent.py
+++ b/agent_service/agents/expenses_agent.py
@@ -57,8 +57,20 @@ class ExpensesAgent(BaseAgent):
     async def _plan(self) -> dict:
         task = (self._directive.task if self._directive else '').lower()
         receipts = getattr(self._directive.context, 'receipts', []) if self._directive else []
+
+        # Detect whether the user is responding to a duplicate-approval request
+        skip_keywords = ('skip', 'yes', 'remove duplicate', 'exclude duplicate', 'drop duplicate')
+        keep_keywords = ('keep all', 'keep both', 'include all', 'no skip', "don't skip")
+        if any(k in task for k in skip_keywords):
+            user_dup_decision = 'skip'
+        elif any(k in task for k in keep_keywords):
+            user_dup_decision = 'keep_all'
+        else:
+            user_dup_decision = 'none'  # first time through — will ask if dups found
+
         return {
             'mode': 'create_from_receipts' if receipts else 'read',
+            'user_dup_decision': user_dup_decision,
             'fetch_summary': any(k in task for k in ('summary', 'overview')) and not receipts,
             'fetch_pending': any(k in task for k in ('pending', 'approve', 'approval')) and not receipts,
             'employee_id': self._directive.params.get('employee_id') if self._directive else None,
@@ -67,7 +79,8 @@ class ExpensesAgent(BaseAgent):
         }
 
     async def _gather(self, plan: dict) -> dict:
-        data: dict = {'mode': plan.get('mode', 'read')}
+        data: dict = {'mode': plan.get('mode', 'read'),
+                      'user_dup_decision': plan.get('user_dup_decision', 'none')}
         if plan.get('mode') == 'create_from_receipts':
             self._gathered_data = data
             return data
@@ -100,6 +113,8 @@ class ExpensesAgent(BaseAgent):
         if not receipts:
             return []
 
+        user_dup_decision = self._gathered_data.get('user_dup_decision', 'none')
+
         user_id = (self._directive.context.peer_data.get('requesting_user_id')
                    if self._directive else None)
         employee_id = await self._et.get_employee_id_for_user(user_id)
@@ -108,30 +123,20 @@ class ExpensesAgent(BaseAgent):
                 'No employee record found for the current user; cannot create expense report.')
             return []
 
-        sheet_name = f'Expense Report - {_date.today().isoformat()}'
-        sheet_result = await self._et.create_expense_sheet(sheet_name, employee_id)
-        if not sheet_result.success:
-            self._escalations_list.append(f'Failed to create expense sheet: {sheet_result.error}')
-            return []
-
-        sheet_id = sheet_result.record_id
-        actions = [f'Created expense sheet "{sheet_name}" (ID {sheet_id})']
-
         # Fetch all expensable products once for category selection
         expense_products = await self._et.get_expense_products()
         default_product_id = expense_products[0]['id'] if expense_products else None
         product_map = {p['id']: p['name'] for p in expense_products}
 
-        # Deduplicate receipts by SHA256 hash — same image uploaded twice
+        # Pass 1: byte-exact dedup (same file uploaded twice)
         seen_hashes: set = set()
         unique_receipts = []
         for r in receipts:
             h = r.get('sha256')
+            if h and h in seen_hashes:
+                logger.info('expenses_agent: skipping byte-identical receipt %s', r.get('filename'))
+                continue
             if h:
-                if h in seen_hashes:
-                    logger.info('expenses_agent: skipping duplicate receipt %s', r.get('filename'))
-                    actions.append(f"Skipped duplicate: {r.get('filename', 'receipt')}")
-                    continue
                 seen_hashes.add(h)
             unique_receipts.append(r)
 
@@ -154,30 +159,46 @@ class ExpensesAgent(BaseAgent):
                                receipt.get('filename'), parsed)
                 parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
                           'date': receipt.get('date_from_name') or _date.today().isoformat(),
-                          'product_name': ''}
+                          'time': None, 'product_name': ''}
             paired.append((receipt, parsed))
 
-        # Semantic dedup — different photos of the same physical receipt share
-        # the same amount, date, and a similar vendor name.
+        # Pass 2: semantic dedup — detect multiple photos of the same receipt
         deduped: list[tuple[dict, dict]] = []
+        dup_pairs: list[tuple[int, dict, dict]] = []  # (kept_idx, dup_receipt, dup_parsed)
         for receipt, parsed in paired:
             dup_idx = self._find_semantic_duplicate(parsed, deduped)
             if dup_idx is not None:
-                # Keep whichever photo produced more OCR text (clearer shot)
-                existing_receipt, _ = deduped[dup_idx]
-                if len(receipt.get('text', '')) > len(existing_receipt.get('text', '')):
+                dup_pairs.append((dup_idx, receipt, parsed))
+                # Tentatively keep whichever photo had more OCR text
+                if len(receipt.get('text', '')) > len(deduped[dup_idx][0].get('text', '')):
                     deduped[dup_idx] = (receipt, parsed)
-                actions.append(
-                    f"Skipped duplicate photo of "
-                    f"{parsed.get('vendor', receipt.get('filename', 'receipt'))}"
-                    f" ${float(parsed.get('amount', 0)):.2f}"
-                )
-                logger.info('expenses_agent: semantic duplicate %s skipped',
-                            receipt.get('filename'))
             else:
                 deduped.append((receipt, parsed))
 
-        for receipt, parsed in deduped:
+        # If duplicates were found and user hasn't decided yet, pause and ask
+        if dup_pairs and user_dup_decision == 'none':
+            self._gathered_data['mode'] = 'awaiting_dup_approval'
+            self._pending_dup_pairs = dup_pairs
+            self._deduped = deduped
+            return []
+
+        # Apply user's decision
+        if user_dup_decision == 'keep_all':
+            final_list = paired
+        else:
+            final_list = deduped  # default: skip semantic duplicates
+
+        # Create the sheet now that we know what to include
+        sheet_name = f'Expense Report - {_date.today().isoformat()}'
+        sheet_result = await self._et.create_expense_sheet(sheet_name, employee_id)
+        if not sheet_result.success:
+            self._escalations_list.append(f'Failed to create expense sheet: {sheet_result.error}')
+            return []
+
+        sheet_id = sheet_result.record_id
+        actions = [f'Created expense sheet "{sheet_name}" (ID {sheet_id})']
+
+        for receipt, parsed in final_list:
 
             # Pick product by name match returned from LLM, fall back to default
             product_id = default_product_id
@@ -228,12 +249,14 @@ class ExpensesAgent(BaseAgent):
         Match criteria (all must pass):
           1. Same date
           2. Amount > 0 and within $0.05 of each other
-          3. Vendor name similarity >= 60 %  (or both vendors are raw filenames)
+          3. Transaction times within 30 min of each other (if both present);
+             times > 30 min apart rule out a duplicate
+          4. Vendor name similarity >= 60 %  (or both vendors are raw filenames)
         """
         amt = float(parsed.get('amount', 0))
         date = parsed.get('date', '')
+        time = parsed.get('time')  # HH:MM or None
         vendor = str(parsed.get('vendor', '')).lower().strip()
-        # If OCR failed the vendor is just a filename — can't dedup by content
         is_filename = vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
 
         for idx, (_, other) in enumerate(candidates):
@@ -245,6 +268,17 @@ class ExpensesAgent(BaseAgent):
                 continue
             if date != other.get('date', ''):
                 continue
+            # Time check: if both receipts have a transaction time and they are
+            # more than 30 minutes apart they are different transactions.
+            other_time = other.get('time')
+            if time and other_time:
+                try:
+                    h1, m1 = (int(p) for p in time.split(':')[:2])
+                    h2, m2 = (int(p) for p in other_time.split(':')[:2])
+                    if abs((h1 * 60 + m1) - (h2 * 60 + m2)) > 30:
+                        continue
+                except Exception:
+                    pass  # unparseable time — ignore the signal
             other_vendor = str(other.get('vendor', '')).lower().strip()
             other_is_filename = other_vendor.endswith(
                 ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
@@ -261,7 +295,7 @@ class ExpensesAgent(BaseAgent):
                                    date_hint: str = None) -> dict:
         today = _date.today().isoformat()
         fallback = {'vendor': filename, 'amount': 0.0,
-                    'date': date_hint or today, 'product_name': ''}
+                    'date': date_hint or today, 'time': None, 'product_name': ''}
         ocr_failed = not text or text.startswith('[')
 
         product_list = ''
@@ -289,6 +323,8 @@ class ExpensesAgent(BaseAgent):
                 'do NOT use subtotal, tax, or tip separately; '
                 'if multiple totals appear pick the largest one labeled as the final total),\n'
                 f'"date" (string YYYY-MM-DD, use {date_hint or today} if not found in text),\n'
+                '"time" (string HH:MM in 24-hour format — the transaction time printed on the receipt; '
+                'null if not present),\n'
                 f'"product_name" (string, pick the best match from [{product_list}] or empty string).\n\n'
                 f'Receipt text:\n{text[:2000]}\n\nJSON only:'
             )
@@ -305,6 +341,7 @@ class ExpensesAgent(BaseAgent):
                     'vendor': str(data.get('vendor', filename)),
                     'amount': float(data.get('amount', 0.0)),
                     'date': str(data.get('date') or date_hint or today),
+                    'time': data.get('time') or None,
                     'product_name': str(data.get('product_name', '')),
                 }
         except Exception as exc:
@@ -315,12 +352,39 @@ class ExpensesAgent(BaseAgent):
         data = self._gathered_data
         directive_id = self._directive.directive_id if self._directive else ''
 
+        if data.get('mode') == 'awaiting_dup_approval':
+            dup_pairs = getattr(self, '_pending_dup_pairs', [])
+            deduped = getattr(self, '_deduped', [])
+            lines = [f'I found {len(dup_pairs)} suspected duplicate receipt photo(s). '
+                     f'Please review before I create the expense report:\n']
+            for kept_idx, dup_receipt, dup_parsed in dup_pairs:
+                kept_receipt, kept_parsed = deduped[kept_idx]
+                vendor = (dup_parsed.get('vendor') or kept_parsed.get('vendor', 'Unknown'))
+                amount = float(dup_parsed.get('amount', 0))
+                dt = dup_parsed.get('date', '')
+                time_a = kept_parsed.get('time') or ''
+                time_b = dup_parsed.get('time') or ''
+                line = f'• {vendor}  ${amount:.2f}  on {dt}'
+                if time_a or time_b:
+                    line += f'  (Photo A at {time_a or "?"}, Photo B at {time_b or "?"})'
+                line += (f'\n  Photo A: {kept_receipt.get("filename", "?")}'
+                         f'\n  Photo B: {dup_receipt.get("filename", "?")}')
+                lines.append(line)
+            lines.append(
+                '\nReply "skip duplicates" to keep the clearest photo of each, '
+                'or "keep all" to include every photo as a separate expense.'
+            )
+            return AgentReport(
+                directive_id=directive_id, agent=self.name, status='complete',
+                summary='\n'.join(lines), data=data,
+                escalations=[], actions_taken=[])
+
         if data.get('mode') == 'create_from_receipts':
             if self._actions_taken:
                 lines = '\n'.join(f'  • {a}' for a in self._actions_taken)
                 summary = (
                     f'Expense report created successfully:\n{lines}\n\n'
-                    'The report is in draft. Please open Odoo > Expenses, '
+                    'The report is in draft. Please open Odoo › Expenses, '
                     'review the entries, and click Submit to send for approval.'
                 )
                 status = 'complete'