Add duplicate approval flow with time-based dedup

- expenses_agent: extract transaction time (HH:MM) from OCR receipt text
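- expenses_agent: _find_semantic_duplicate uses the transaction time to rule out false positives (receipts more than 30 min apart are treated as different transactions; the check is skipped when either time is missing)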
- expenses_agent: pause when duplicates found, set mode=awaiting_dup_approval, ask user before creating sheet
- expenses_agent: _report formats approval message listing each dup pair with vendor/amount/date/times/filenames
- ab_ai_mail: _find_pending_attachments recognises the dup-approval bot message as a pending question, so the ZIP is re-attached when the user replies

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-16 02:07:37 -04:00
parent f90a2ee863
commit 462f63d11d
2 changed files with 105 additions and 35 deletions

View File

@@ -191,12 +191,18 @@ class DiscussChannel(models.Model):
(i.e. the bot hasn't already acted on those files).
"""
messages = self.message_ids.sorted('date', reverse=True)[:_LOOKBACK_MESSAGES]
_bot_question_phrases = (
'what would you like me to do',
'suspected duplicate',
'skip duplicates',
'keep all',
)
prev_was_bot_question = False
for msg in messages:
is_bot = msg.author_id == bot_partner
if is_bot:
# Check whether this bot message was a clarification question
if 'what would you like me to do' in (msg.body or '').lower():
body_lower = (msg.body or '').lower()
if any(p in body_lower for p in _bot_question_phrases):
prev_was_bot_question = True
continue
# Human message

View File

@@ -57,8 +57,20 @@ class ExpensesAgent(BaseAgent):
async def _plan(self) -> dict:
task = (self._directive.task if self._directive else '').lower()
receipts = getattr(self._directive.context, 'receipts', []) if self._directive else []
# Detect whether the user is responding to a duplicate-approval request
skip_keywords = ('skip', 'yes', 'remove duplicate', 'exclude duplicate', 'drop duplicate')
keep_keywords = ('keep all', 'keep both', 'include all', 'no skip', "don't skip")
if any(k in task for k in skip_keywords):
user_dup_decision = 'skip'
elif any(k in task for k in keep_keywords):
user_dup_decision = 'keep_all'
else:
user_dup_decision = 'none' # first time through — will ask if dups found
return {
'mode': 'create_from_receipts' if receipts else 'read',
'user_dup_decision': user_dup_decision,
'fetch_summary': any(k in task for k in ('summary', 'overview')) and not receipts,
'fetch_pending': any(k in task for k in ('pending', 'approve', 'approval')) and not receipts,
'employee_id': self._directive.params.get('employee_id') if self._directive else None,
@@ -67,7 +79,8 @@ class ExpensesAgent(BaseAgent):
}
async def _gather(self, plan: dict) -> dict:
data: dict = {'mode': plan.get('mode', 'read')}
data: dict = {'mode': plan.get('mode', 'read'),
'user_dup_decision': plan.get('user_dup_decision', 'none')}
if plan.get('mode') == 'create_from_receipts':
self._gathered_data = data
return data
@@ -100,6 +113,8 @@ class ExpensesAgent(BaseAgent):
if not receipts:
return []
user_dup_decision = self._gathered_data.get('user_dup_decision', 'none')
user_id = (self._directive.context.peer_data.get('requesting_user_id')
if self._directive else None)
employee_id = await self._et.get_employee_id_for_user(user_id)
@@ -108,30 +123,20 @@ class ExpensesAgent(BaseAgent):
'No employee record found for the current user; cannot create expense report.')
return []
sheet_name = f'Expense Report - {_date.today().isoformat()}'
sheet_result = await self._et.create_expense_sheet(sheet_name, employee_id)
if not sheet_result.success:
self._escalations_list.append(f'Failed to create expense sheet: {sheet_result.error}')
return []
sheet_id = sheet_result.record_id
actions = [f'Created expense sheet "{sheet_name}" (ID {sheet_id})']
# Fetch all expensable products once for category selection
expense_products = await self._et.get_expense_products()
default_product_id = expense_products[0]['id'] if expense_products else None
product_map = {p['id']: p['name'] for p in expense_products}
# Deduplicate receipts by SHA256 hash — same image uploaded twice
# Pass 1: byte-exact dedup (same file uploaded twice)
seen_hashes: set = set()
unique_receipts = []
for r in receipts:
h = r.get('sha256')
if h and h in seen_hashes:
logger.info('expenses_agent: skipping byte-identical receipt %s', r.get('filename'))
continue
if h:
if h in seen_hashes:
logger.info('expenses_agent: skipping duplicate receipt %s', r.get('filename'))
actions.append(f"Skipped duplicate: {r.get('filename', 'receipt')}")
continue
seen_hashes.add(h)
unique_receipts.append(r)
@@ -154,30 +159,46 @@ class ExpensesAgent(BaseAgent):
receipt.get('filename'), parsed)
parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
'date': receipt.get('date_from_name') or _date.today().isoformat(),
'product_name': ''}
'time': None, 'product_name': ''}
paired.append((receipt, parsed))
# Semantic dedup — different photos of the same physical receipt share
# the same amount, date, and a similar vendor name.
# Pass 2: semantic dedup — detect multiple photos of the same receipt
deduped: list[tuple[dict, dict]] = []
dup_pairs: list[tuple[int, dict, dict]] = [] # (kept_idx, dup_receipt, dup_parsed)
for receipt, parsed in paired:
dup_idx = self._find_semantic_duplicate(parsed, deduped)
if dup_idx is not None:
# Keep whichever photo produced more OCR text (clearer shot)
existing_receipt, _ = deduped[dup_idx]
if len(receipt.get('text', '')) > len(existing_receipt.get('text', '')):
dup_pairs.append((dup_idx, receipt, parsed))
# Tentatively keep whichever photo had more OCR text
if len(receipt.get('text', '')) > len(deduped[dup_idx][0].get('text', '')):
deduped[dup_idx] = (receipt, parsed)
actions.append(
f"Skipped duplicate photo of "
f"{parsed.get('vendor', receipt.get('filename', 'receipt'))}"
f" ${float(parsed.get('amount', 0)):.2f}"
)
logger.info('expenses_agent: semantic duplicate %s skipped',
receipt.get('filename'))
else:
deduped.append((receipt, parsed))
for receipt, parsed in deduped:
# If duplicates were found and user hasn't decided yet, pause and ask
if dup_pairs and user_dup_decision == 'none':
self._gathered_data['mode'] = 'awaiting_dup_approval'
self._pending_dup_pairs = dup_pairs
self._deduped = deduped
return []
# Apply user's decision
if user_dup_decision == 'keep_all':
final_list = paired
else:
final_list = deduped # default: skip semantic duplicates
# Create the sheet now that we know what to include
sheet_name = f'Expense Report - {_date.today().isoformat()}'
sheet_result = await self._et.create_expense_sheet(sheet_name, employee_id)
if not sheet_result.success:
self._escalations_list.append(f'Failed to create expense sheet: {sheet_result.error}')
return []
sheet_id = sheet_result.record_id
actions = [f'Created expense sheet "{sheet_name}" (ID {sheet_id})']
for receipt, parsed in final_list:
# Pick product by name match returned from LLM, fall back to default
product_id = default_product_id
@@ -228,12 +249,14 @@ class ExpensesAgent(BaseAgent):
Match criteria (all must pass):
1. Same date
2. Amount > 0 and within $0.05 of each other
3. Vendor name similarity >= 60 % (or both vendors are raw filenames)
3. Transaction times within 30 min of each other (if both present);
times > 30 min apart rule out a duplicate
4. Vendor name similarity >= 60 % (or both vendors are raw filenames)
"""
amt = float(parsed.get('amount', 0))
date = parsed.get('date', '')
time = parsed.get('time') # HH:MM or None
vendor = str(parsed.get('vendor', '')).lower().strip()
# If OCR failed the vendor is just a filename — can't dedup by content
is_filename = vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
for idx, (_, other) in enumerate(candidates):
@@ -245,6 +268,17 @@ class ExpensesAgent(BaseAgent):
continue
if date != other.get('date', ''):
continue
# Time check: if both receipts have a transaction time and they are
# more than 30 minutes apart they are different transactions.
other_time = other.get('time')
if time and other_time:
try:
h1, m1 = (int(p) for p in time.split(':')[:2])
h2, m2 = (int(p) for p in other_time.split(':')[:2])
if abs((h1 * 60 + m1) - (h2 * 60 + m2)) > 30:
continue
except Exception:
pass # unparseable time — ignore the signal
other_vendor = str(other.get('vendor', '')).lower().strip()
other_is_filename = other_vendor.endswith(
('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
@@ -261,7 +295,7 @@ class ExpensesAgent(BaseAgent):
date_hint: str = None) -> dict:
today = _date.today().isoformat()
fallback = {'vendor': filename, 'amount': 0.0,
'date': date_hint or today, 'product_name': ''}
'date': date_hint or today, 'time': None, 'product_name': ''}
ocr_failed = not text or text.startswith('[')
product_list = ''
@@ -289,6 +323,8 @@ class ExpensesAgent(BaseAgent):
'do NOT use subtotal, tax, or tip separately; '
'if multiple totals appear pick the largest one labeled as the final total),\n'
f'"date" (string YYYY-MM-DD, use {date_hint or today} if not found in text),\n'
'"time" (string HH:MM in 24-hour format — the transaction time printed on the receipt; '
'null if not present),\n'
f'"product_name" (string, pick the best match from [{product_list}] or empty string).\n\n'
f'Receipt text:\n{text[:2000]}\n\nJSON only:'
)
@@ -305,6 +341,7 @@ class ExpensesAgent(BaseAgent):
'vendor': str(data.get('vendor', filename)),
'amount': float(data.get('amount', 0.0)),
'date': str(data.get('date') or date_hint or today),
'time': data.get('time') or None,
'product_name': str(data.get('product_name', '')),
}
except Exception as exc:
@@ -315,12 +352,39 @@ class ExpensesAgent(BaseAgent):
data = self._gathered_data
directive_id = self._directive.directive_id if self._directive else ''
if data.get('mode') == 'awaiting_dup_approval':
dup_pairs = getattr(self, '_pending_dup_pairs', [])
deduped = getattr(self, '_deduped', [])
lines = [f'I found {len(dup_pairs)} suspected duplicate receipt photo(s). '
f'Please review before I create the expense report:\n']
for kept_idx, dup_receipt, dup_parsed in dup_pairs:
kept_receipt, kept_parsed = deduped[kept_idx]
vendor = (dup_parsed.get('vendor') or kept_parsed.get('vendor', 'Unknown'))
amount = float(dup_parsed.get('amount', 0))
dt = dup_parsed.get('date', '')
time_a = kept_parsed.get('time') or ''
time_b = dup_parsed.get('time') or ''
line = f'{vendor} ${amount:.2f} on {dt}'
if time_a or time_b:
line += f' (Photo A at {time_a or "?"}, Photo B at {time_b or "?"})'
line += (f'\n Photo A: {kept_receipt.get("filename", "?")}'
f'\n Photo B: {dup_receipt.get("filename", "?")}')
lines.append(line)
lines.append(
'\nReply "skip duplicates" to keep the clearest photo of each, '
'or "keep all" to include every photo as a separate expense.'
)
return AgentReport(
directive_id=directive_id, agent=self.name, status='complete',
summary='\n'.join(lines), data=data,
escalations=[], actions_taken=[])
if data.get('mode') == 'create_from_receipts':
if self._actions_taken:
lines = '\n'.join(f'{a}' for a in self._actions_taken)
summary = (
f'Expense report created successfully:\n{lines}\n\n'
'The report is in draft. Please open Odoo > Expenses, '
'The report is in draft. Please open Odoo Expenses, '
'review the entries, and click Submit to send for approval.'
)
status = 'complete'