From 462f63d11d64a7af25563953c2310e397e44489a Mon Sep 17 00:00:00 2001 From: Carlos Garcia Date: Sat, 16 May 2026 02:07:37 -0400 Subject: [PATCH] Add duplicate approval flow with time-based dedup - expenses_agent: extract transaction time (HH:MM) from OCR receipt text - expenses_agent: _find_semantic_duplicate uses time to rule out false positives (>30 min apart = different receipts) - expenses_agent: pause when duplicates found, set mode=awaiting_dup_approval, ask user before creating sheet - expenses_agent: _report formats approval message listing each dup pair with vendor/amount/date/times/filenames - ab_ai_mail: _find_pending_attachments recognises dup-approval bot message so ZIP re-attaches on user reply Co-Authored-By: Claude Sonnet 4.6 --- addons/activeblue_ai/models/ab_ai_mail.py | 10 +- agent_service/agents/expenses_agent.py | 130 ++++++++++++++++------ 2 files changed, 105 insertions(+), 35 deletions(-) diff --git a/addons/activeblue_ai/models/ab_ai_mail.py b/addons/activeblue_ai/models/ab_ai_mail.py index acee401..382a0c9 100644 --- a/addons/activeblue_ai/models/ab_ai_mail.py +++ b/addons/activeblue_ai/models/ab_ai_mail.py @@ -191,12 +191,18 @@ class DiscussChannel(models.Model): (i.e. the bot hasn't already acted on those files). """ messages = self.message_ids.sorted('date', reverse=True)[:_LOOKBACK_MESSAGES] + _bot_question_phrases = ( + 'what would you like me to do', + 'suspected duplicate', + 'skip duplicates', + 'keep all', + ) prev_was_bot_question = False for msg in messages: is_bot = msg.author_id == bot_partner if is_bot: - # Check whether this bot message was a clarification question - if 'what would you like me to do' in (msg.body or '').lower(): + body_lower = (msg.body or '').lower() + if any(p in body_lower for p in _bot_question_phrases): prev_was_bot_question = True continue # Human message diff --git a/agent_service/agents/expenses_agent.py b/agent_service/agents/expenses_agent.py index a3efeff..0427ed4 100644 --- a/agent_service/agents/expenses_agent.py +++ b/agent_service/agents/expenses_agent.py @@ -57,8 +57,20 @@ class ExpensesAgent(BaseAgent): async def _plan(self) -> dict: task = (self._directive.task if self._directive else '').lower() receipts = getattr(self._directive.context, 'receipts', []) if self._directive else [] + + # Detect whether the user is responding to a duplicate-approval request + skip_keywords = ('skip', 'yes', 'remove duplicate', 'exclude duplicate', 'drop duplicate') + keep_keywords = ('keep all', 'keep both', 'include all', 'no skip', "don't skip") + if any(k in task for k in skip_keywords): + user_dup_decision = 'skip' + elif any(k in task for k in keep_keywords): + user_dup_decision = 'keep_all' + else: + user_dup_decision = 'none' # first time through — will ask if dups found + return { 'mode': 'create_from_receipts' if receipts else 'read', + 'user_dup_decision': user_dup_decision, 'fetch_summary': any(k in task for k in ('summary', 'overview')) and not receipts, 'fetch_pending': any(k in task for k in ('pending', 'approve', 'approval')) and not receipts, 'employee_id': self._directive.params.get('employee_id') if self._directive else None, @@ -67,7 +79,8 @@ class ExpensesAgent(BaseAgent): } async def _gather(self, plan: dict) -> dict: - data: dict = {'mode': plan.get('mode', 'read')} + data: dict = {'mode': plan.get('mode', 'read'), + 'user_dup_decision': plan.get('user_dup_decision', 'none')} if plan.get('mode') == 'create_from_receipts': self._gathered_data = data return data @@ -100,6 +113,8 @@ class ExpensesAgent(BaseAgent): if not receipts: return [] + user_dup_decision = self._gathered_data.get('user_dup_decision', 'none') + user_id = (self._directive.context.peer_data.get('requesting_user_id') if self._directive else None) employee_id = await self._et.get_employee_id_for_user(user_id) @@ -108,30 +123,20 @@ class ExpensesAgent(BaseAgent): 'No employee record found for the current user; cannot create expense report.') return [] - sheet_name = f'Expense Report - {_date.today().isoformat()}' - sheet_result = await self._et.create_expense_sheet(sheet_name, employee_id) - if not sheet_result.success: - self._escalations_list.append(f'Failed to create expense sheet: {sheet_result.error}') - return [] - - sheet_id = sheet_result.record_id - actions = [f'Created expense sheet "{sheet_name}" (ID {sheet_id})'] - # Fetch all expensable products once for category selection expense_products = await self._et.get_expense_products() default_product_id = expense_products[0]['id'] if expense_products else None product_map = {p['id']: p['name'] for p in expense_products} - # Deduplicate receipts by SHA256 hash — same image uploaded twice + # Pass 1: byte-exact dedup (same file uploaded twice) seen_hashes: set = set() unique_receipts = [] for r in receipts: h = r.get('sha256') + if h and h in seen_hashes: + logger.info('expenses_agent: skipping byte-identical receipt %s', r.get('filename')) + continue if h: - if h in seen_hashes: - logger.info('expenses_agent: skipping duplicate receipt %s', r.get('filename')) - actions.append(f"Skipped duplicate: {r.get('filename', 'receipt')}") - continue seen_hashes.add(h) unique_receipts.append(r) @@ -154,30 +159,46 @@ class ExpensesAgent(BaseAgent): receipt.get('filename'), parsed) parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0, 'date': receipt.get('date_from_name') or _date.today().isoformat(), - 'product_name': ''} + 'time': None, 'product_name': ''} paired.append((receipt, parsed)) - # Semantic dedup — different photos of the same physical receipt share - # the same amount, date, and a similar vendor name. + # Pass 2: semantic dedup — detect multiple photos of the same receipt deduped: list[tuple[dict, dict]] = [] + dup_pairs: list[tuple[int, dict, dict]] = [] # (kept_idx, dup_receipt, dup_parsed) for receipt, parsed in paired: dup_idx = self._find_semantic_duplicate(parsed, deduped) if dup_idx is not None: - # Keep whichever photo produced more OCR text (clearer shot) - existing_receipt, _ = deduped[dup_idx] - if len(receipt.get('text', '')) > len(existing_receipt.get('text', '')): + dup_pairs.append((dup_idx, receipt, parsed)) + # Tentatively keep whichever photo had more OCR text + if len(receipt.get('text', '')) > len(deduped[dup_idx][0].get('text', '')): deduped[dup_idx] = (receipt, parsed) - actions.append( - f"Skipped duplicate photo of " - f"{parsed.get('vendor', receipt.get('filename', 'receipt'))}" - f" ${float(parsed.get('amount', 0)):.2f}" - ) - logger.info('expenses_agent: semantic duplicate %s skipped', - receipt.get('filename')) else: deduped.append((receipt, parsed)) - for receipt, parsed in deduped: + # If duplicates were found and user hasn't decided yet, pause and ask + if dup_pairs and user_dup_decision == 'none': + self._gathered_data['mode'] = 'awaiting_dup_approval' + self._pending_dup_pairs = dup_pairs + self._deduped = deduped + return [] + + # Apply user's decision + if user_dup_decision == 'keep_all': + final_list = paired + else: + final_list = deduped # default: skip semantic duplicates + + # Create the sheet now that we know what to include + sheet_name = f'Expense Report - {_date.today().isoformat()}' + sheet_result = await self._et.create_expense_sheet(sheet_name, employee_id) + if not sheet_result.success: + self._escalations_list.append(f'Failed to create expense sheet: {sheet_result.error}') + return [] + + sheet_id = sheet_result.record_id + actions = [f'Created expense sheet "{sheet_name}" (ID {sheet_id})'] + + for receipt, parsed in final_list: # Pick product by name match returned from LLM, fall back to default product_id = default_product_id @@ -228,12 +249,14 @@ class ExpensesAgent(BaseAgent): Match criteria (all must pass): 1. Same date 2. Amount > 0 and within $0.05 of each other - 3. Vendor name similarity >= 60 % (or both vendors are raw filenames) + 3. Transaction times within 30 min of each other (if both present); + times > 30 min apart rule out a duplicate + 4. Vendor name similarity >= 60 % (or both vendors are raw filenames) """ amt = float(parsed.get('amount', 0)) date = parsed.get('date', '') + time = parsed.get('time') # HH:MM or None vendor = str(parsed.get('vendor', '')).lower().strip() - # If OCR failed the vendor is just a filename — can't dedup by content is_filename = vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')) for idx, (_, other) in enumerate(candidates): @@ -245,6 +268,17 @@ class ExpensesAgent(BaseAgent): continue if date != other.get('date', ''): continue + # Time check: if both receipts have a transaction time and they are + # more than 30 minutes apart they are different transactions. + other_time = other.get('time') + if time and other_time: + try: + h1, m1 = (int(p) for p in time.split(':')[:2]) + h2, m2 = (int(p) for p in other_time.split(':')[:2]) + if abs((h1 * 60 + m1) - (h2 * 60 + m2)) > 30: + continue + except Exception: + pass # unparseable time — ignore the signal other_vendor = str(other.get('vendor', '')).lower().strip() other_is_filename = other_vendor.endswith( ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')) @@ -261,7 +295,7 @@ class ExpensesAgent(BaseAgent): date_hint: str = None) -> dict: today = _date.today().isoformat() fallback = {'vendor': filename, 'amount': 0.0, - 'date': date_hint or today, 'product_name': ''} + 'date': date_hint or today, 'time': None, 'product_name': ''} ocr_failed = not text or text.startswith('[') product_list = '' @@ -289,6 +323,8 @@ class ExpensesAgent(BaseAgent): 'do NOT use subtotal, tax, or tip separately; ' 'if multiple totals appear pick the largest one labeled as the final total),\n' f'"date" (string YYYY-MM-DD, use {date_hint or today} if not found in text),\n' + '"time" (string HH:MM in 24-hour format — the transaction time printed on the receipt; ' + 'null if not present),\n' f'"product_name" (string, pick the best match from [{product_list}] or empty string).\n\n' f'Receipt text:\n{text[:2000]}\n\nJSON only:' ) @@ -305,6 +341,7 @@ class ExpensesAgent(BaseAgent): 'vendor': str(data.get('vendor', filename)), 'amount': float(data.get('amount', 0.0)), 'date': str(data.get('date') or date_hint or today), + 'time': data.get('time') or None, 'product_name': str(data.get('product_name', '')), } except Exception as exc: @@ -315,12 +352,39 @@ class ExpensesAgent(BaseAgent): data = self._gathered_data directive_id = self._directive.directive_id if self._directive else '' + if data.get('mode') == 'awaiting_dup_approval': + dup_pairs = getattr(self, '_pending_dup_pairs', []) + deduped = getattr(self, '_deduped', []) + lines = [f'I found {len(dup_pairs)} suspected duplicate receipt photo(s). ' + f'Please review before I create the expense report:\n'] + for kept_idx, dup_receipt, dup_parsed in dup_pairs: + kept_receipt, kept_parsed = deduped[kept_idx] + vendor = (dup_parsed.get('vendor') or kept_parsed.get('vendor', 'Unknown')) + amount = float(dup_parsed.get('amount', 0)) + dt = dup_parsed.get('date', '') + time_a = kept_parsed.get('time') or '' + time_b = dup_parsed.get('time') or '' + line = f'• {vendor} ${amount:.2f} on {dt}' + if time_a or time_b: + line += f' (Photo A at {time_a or "?"}, Photo B at {time_b or "?"})' + line += (f'\n Photo A: {kept_receipt.get("filename", "?")}' + f'\n Photo B: {dup_receipt.get("filename", "?")}') + lines.append(line) + lines.append( + '\nReply "skip duplicates" to keep the clearest photo of each, ' + 'or "keep all" to include every photo as a separate expense.' + ) + return AgentReport( + directive_id=directive_id, agent=self.name, status='complete', + summary='\n'.join(lines), data=data, + escalations=[], actions_taken=[]) + if data.get('mode') == 'create_from_receipts': if self._actions_taken: lines = '\n'.join(f' • {a}' for a in self._actions_taken) summary = ( f'Expense report created successfully:\n{lines}\n\n' - 'The report is in draft. Please open Odoo > Expenses, ' + 'The report is in draft. Please open Odoo › Expenses, ' 'review the entries, and click Submit to send for approval.' ) status = 'complete'