Add duplicate approval flow with time-based dedup
- expenses_agent: extract transaction time (HH:MM) from OCR receipt text
- expenses_agent: _find_semantic_duplicate uses time to rule out false positives (>30 min apart = different receipts)
- expenses_agent: pause when duplicates are found, set mode=awaiting_dup_approval, and ask the user before creating the sheet
- expenses_agent: _report formats an approval message listing each dup pair with vendor/amount/date/times/filenames
- ab_ai_mail: _find_pending_attachments recognises the dup-approval bot message so the ZIP re-attaches on user reply

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -191,12 +191,18 @@ class DiscussChannel(models.Model):
|
||||
(i.e. the bot hasn't already acted on those files).
|
||||
"""
|
||||
messages = self.message_ids.sorted('date', reverse=True)[:_LOOKBACK_MESSAGES]
|
||||
_bot_question_phrases = (
|
||||
'what would you like me to do',
|
||||
'suspected duplicate',
|
||||
'skip duplicates',
|
||||
'keep all',
|
||||
)
|
||||
prev_was_bot_question = False
|
||||
for msg in messages:
|
||||
is_bot = msg.author_id == bot_partner
|
||||
if is_bot:
|
||||
# Check whether this bot message was a clarification question
|
||||
if 'what would you like me to do' in (msg.body or '').lower():
|
||||
body_lower = (msg.body or '').lower()
|
||||
if any(p in body_lower for p in _bot_question_phrases):
|
||||
prev_was_bot_question = True
|
||||
continue
|
||||
# Human message
|
||||
|
||||
@@ -57,8 +57,20 @@ class ExpensesAgent(BaseAgent):
|
||||
async def _plan(self) -> dict:
|
||||
task = (self._directive.task if self._directive else '').lower()
|
||||
receipts = getattr(self._directive.context, 'receipts', []) if self._directive else []
|
||||
|
||||
# Detect whether the user is responding to a duplicate-approval request
|
||||
skip_keywords = ('skip', 'yes', 'remove duplicate', 'exclude duplicate', 'drop duplicate')
|
||||
keep_keywords = ('keep all', 'keep both', 'include all', 'no skip', "don't skip")
|
||||
if any(k in task for k in skip_keywords):
|
||||
user_dup_decision = 'skip'
|
||||
elif any(k in task for k in keep_keywords):
|
||||
user_dup_decision = 'keep_all'
|
||||
else:
|
||||
user_dup_decision = 'none' # first time through — will ask if dups found
|
||||
|
||||
return {
|
||||
'mode': 'create_from_receipts' if receipts else 'read',
|
||||
'user_dup_decision': user_dup_decision,
|
||||
'fetch_summary': any(k in task for k in ('summary', 'overview')) and not receipts,
|
||||
'fetch_pending': any(k in task for k in ('pending', 'approve', 'approval')) and not receipts,
|
||||
'employee_id': self._directive.params.get('employee_id') if self._directive else None,
|
||||
@@ -67,7 +79,8 @@ class ExpensesAgent(BaseAgent):
|
||||
}
|
||||
|
||||
async def _gather(self, plan: dict) -> dict:
|
||||
data: dict = {'mode': plan.get('mode', 'read')}
|
||||
data: dict = {'mode': plan.get('mode', 'read'),
|
||||
'user_dup_decision': plan.get('user_dup_decision', 'none')}
|
||||
if plan.get('mode') == 'create_from_receipts':
|
||||
self._gathered_data = data
|
||||
return data
|
||||
@@ -100,6 +113,8 @@ class ExpensesAgent(BaseAgent):
|
||||
if not receipts:
|
||||
return []
|
||||
|
||||
user_dup_decision = self._gathered_data.get('user_dup_decision', 'none')
|
||||
|
||||
user_id = (self._directive.context.peer_data.get('requesting_user_id')
|
||||
if self._directive else None)
|
||||
employee_id = await self._et.get_employee_id_for_user(user_id)
|
||||
@@ -108,30 +123,20 @@ class ExpensesAgent(BaseAgent):
|
||||
'No employee record found for the current user; cannot create expense report.')
|
||||
return []
|
||||
|
||||
sheet_name = f'Expense Report - {_date.today().isoformat()}'
|
||||
sheet_result = await self._et.create_expense_sheet(sheet_name, employee_id)
|
||||
if not sheet_result.success:
|
||||
self._escalations_list.append(f'Failed to create expense sheet: {sheet_result.error}')
|
||||
return []
|
||||
|
||||
sheet_id = sheet_result.record_id
|
||||
actions = [f'Created expense sheet "{sheet_name}" (ID {sheet_id})']
|
||||
|
||||
# Fetch all expensable products once for category selection
|
||||
expense_products = await self._et.get_expense_products()
|
||||
default_product_id = expense_products[0]['id'] if expense_products else None
|
||||
product_map = {p['id']: p['name'] for p in expense_products}
|
||||
|
||||
# Deduplicate receipts by SHA256 hash — same image uploaded twice
|
||||
# Pass 1: byte-exact dedup (same file uploaded twice)
|
||||
seen_hashes: set = set()
|
||||
unique_receipts = []
|
||||
for r in receipts:
|
||||
h = r.get('sha256')
|
||||
if h and h in seen_hashes:
|
||||
logger.info('expenses_agent: skipping byte-identical receipt %s', r.get('filename'))
|
||||
continue
|
||||
if h:
|
||||
if h in seen_hashes:
|
||||
logger.info('expenses_agent: skipping duplicate receipt %s', r.get('filename'))
|
||||
actions.append(f"Skipped duplicate: {r.get('filename', 'receipt')}")
|
||||
continue
|
||||
seen_hashes.add(h)
|
||||
unique_receipts.append(r)
|
||||
|
||||
@@ -154,30 +159,46 @@ class ExpensesAgent(BaseAgent):
|
||||
receipt.get('filename'), parsed)
|
||||
parsed = {'vendor': receipt.get('filename', 'Expense'), 'amount': 0.0,
|
||||
'date': receipt.get('date_from_name') or _date.today().isoformat(),
|
||||
'product_name': ''}
|
||||
'time': None, 'product_name': ''}
|
||||
paired.append((receipt, parsed))
|
||||
|
||||
# Semantic dedup — different photos of the same physical receipt share
|
||||
# the same amount, date, and a similar vendor name.
|
||||
# Pass 2: semantic dedup — detect multiple photos of the same receipt
|
||||
deduped: list[tuple[dict, dict]] = []
|
||||
dup_pairs: list[tuple[int, dict, dict]] = [] # (kept_idx, dup_receipt, dup_parsed)
|
||||
for receipt, parsed in paired:
|
||||
dup_idx = self._find_semantic_duplicate(parsed, deduped)
|
||||
if dup_idx is not None:
|
||||
# Keep whichever photo produced more OCR text (clearer shot)
|
||||
existing_receipt, _ = deduped[dup_idx]
|
||||
if len(receipt.get('text', '')) > len(existing_receipt.get('text', '')):
|
||||
dup_pairs.append((dup_idx, receipt, parsed))
|
||||
# Tentatively keep whichever photo had more OCR text
|
||||
if len(receipt.get('text', '')) > len(deduped[dup_idx][0].get('text', '')):
|
||||
deduped[dup_idx] = (receipt, parsed)
|
||||
actions.append(
|
||||
f"Skipped duplicate photo of "
|
||||
f"{parsed.get('vendor', receipt.get('filename', 'receipt'))}"
|
||||
f" ${float(parsed.get('amount', 0)):.2f}"
|
||||
)
|
||||
logger.info('expenses_agent: semantic duplicate %s skipped',
|
||||
receipt.get('filename'))
|
||||
else:
|
||||
deduped.append((receipt, parsed))
|
||||
|
||||
for receipt, parsed in deduped:
|
||||
# If duplicates were found and user hasn't decided yet, pause and ask
|
||||
if dup_pairs and user_dup_decision == 'none':
|
||||
self._gathered_data['mode'] = 'awaiting_dup_approval'
|
||||
self._pending_dup_pairs = dup_pairs
|
||||
self._deduped = deduped
|
||||
return []
|
||||
|
||||
# Apply user's decision
|
||||
if user_dup_decision == 'keep_all':
|
||||
final_list = paired
|
||||
else:
|
||||
final_list = deduped # default: skip semantic duplicates
|
||||
|
||||
# Create the sheet now that we know what to include
|
||||
sheet_name = f'Expense Report - {_date.today().isoformat()}'
|
||||
sheet_result = await self._et.create_expense_sheet(sheet_name, employee_id)
|
||||
if not sheet_result.success:
|
||||
self._escalations_list.append(f'Failed to create expense sheet: {sheet_result.error}')
|
||||
return []
|
||||
|
||||
sheet_id = sheet_result.record_id
|
||||
actions = [f'Created expense sheet "{sheet_name}" (ID {sheet_id})']
|
||||
|
||||
for receipt, parsed in final_list:
|
||||
|
||||
# Pick product by name match returned from LLM, fall back to default
|
||||
product_id = default_product_id
|
||||
@@ -228,12 +249,14 @@ class ExpensesAgent(BaseAgent):
|
||||
Match criteria (all must pass):
|
||||
1. Same date
|
||||
2. Amount > 0 and within $0.05 of each other
|
||||
3. Vendor name similarity >= 60 % (or both vendors are raw filenames)
|
||||
3. Transaction times within 30 min of each other (if both present);
|
||||
times > 30 min apart rule out a duplicate
|
||||
4. Vendor name similarity >= 60 % (or both vendors are raw filenames)
|
||||
"""
|
||||
amt = float(parsed.get('amount', 0))
|
||||
date = parsed.get('date', '')
|
||||
time = parsed.get('time') # HH:MM or None
|
||||
vendor = str(parsed.get('vendor', '')).lower().strip()
|
||||
# If OCR failed the vendor is just a filename — can't dedup by content
|
||||
is_filename = vendor.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
|
||||
|
||||
for idx, (_, other) in enumerate(candidates):
|
||||
@@ -245,6 +268,17 @@ class ExpensesAgent(BaseAgent):
|
||||
continue
|
||||
if date != other.get('date', ''):
|
||||
continue
|
||||
# Time check: if both receipts have a transaction time and they are
|
||||
# more than 30 minutes apart they are different transactions.
|
||||
other_time = other.get('time')
|
||||
if time and other_time:
|
||||
try:
|
||||
h1, m1 = (int(p) for p in time.split(':')[:2])
|
||||
h2, m2 = (int(p) for p in other_time.split(':')[:2])
|
||||
if abs((h1 * 60 + m1) - (h2 * 60 + m2)) > 30:
|
||||
continue
|
||||
except Exception:
|
||||
pass # unparseable time — ignore the signal
|
||||
other_vendor = str(other.get('vendor', '')).lower().strip()
|
||||
other_is_filename = other_vendor.endswith(
|
||||
('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
|
||||
@@ -261,7 +295,7 @@ class ExpensesAgent(BaseAgent):
|
||||
date_hint: str = None) -> dict:
|
||||
today = _date.today().isoformat()
|
||||
fallback = {'vendor': filename, 'amount': 0.0,
|
||||
'date': date_hint or today, 'product_name': ''}
|
||||
'date': date_hint or today, 'time': None, 'product_name': ''}
|
||||
ocr_failed = not text or text.startswith('[')
|
||||
|
||||
product_list = ''
|
||||
@@ -289,6 +323,8 @@ class ExpensesAgent(BaseAgent):
|
||||
'do NOT use subtotal, tax, or tip separately; '
|
||||
'if multiple totals appear pick the largest one labeled as the final total),\n'
|
||||
f'"date" (string YYYY-MM-DD, use {date_hint or today} if not found in text),\n'
|
||||
'"time" (string HH:MM in 24-hour format — the transaction time printed on the receipt; '
|
||||
'null if not present),\n'
|
||||
f'"product_name" (string, pick the best match from [{product_list}] or empty string).\n\n'
|
||||
f'Receipt text:\n{text[:2000]}\n\nJSON only:'
|
||||
)
|
||||
@@ -305,6 +341,7 @@ class ExpensesAgent(BaseAgent):
|
||||
'vendor': str(data.get('vendor', filename)),
|
||||
'amount': float(data.get('amount', 0.0)),
|
||||
'date': str(data.get('date') or date_hint or today),
|
||||
'time': data.get('time') or None,
|
||||
'product_name': str(data.get('product_name', '')),
|
||||
}
|
||||
except Exception as exc:
|
||||
@@ -315,12 +352,39 @@ class ExpensesAgent(BaseAgent):
|
||||
data = self._gathered_data
|
||||
directive_id = self._directive.directive_id if self._directive else ''
|
||||
|
||||
if data.get('mode') == 'awaiting_dup_approval':
|
||||
dup_pairs = getattr(self, '_pending_dup_pairs', [])
|
||||
deduped = getattr(self, '_deduped', [])
|
||||
lines = [f'I found {len(dup_pairs)} suspected duplicate receipt photo(s). '
|
||||
f'Please review before I create the expense report:\n']
|
||||
for kept_idx, dup_receipt, dup_parsed in dup_pairs:
|
||||
kept_receipt, kept_parsed = deduped[kept_idx]
|
||||
vendor = (dup_parsed.get('vendor') or kept_parsed.get('vendor', 'Unknown'))
|
||||
amount = float(dup_parsed.get('amount', 0))
|
||||
dt = dup_parsed.get('date', '')
|
||||
time_a = kept_parsed.get('time') or ''
|
||||
time_b = dup_parsed.get('time') or ''
|
||||
line = f'• {vendor} ${amount:.2f} on {dt}'
|
||||
if time_a or time_b:
|
||||
line += f' (Photo A at {time_a or "?"}, Photo B at {time_b or "?"})'
|
||||
line += (f'\n Photo A: {kept_receipt.get("filename", "?")}'
|
||||
f'\n Photo B: {dup_receipt.get("filename", "?")}')
|
||||
lines.append(line)
|
||||
lines.append(
|
||||
'\nReply "skip duplicates" to keep the clearest photo of each, '
|
||||
'or "keep all" to include every photo as a separate expense.'
|
||||
)
|
||||
return AgentReport(
|
||||
directive_id=directive_id, agent=self.name, status='complete',
|
||||
summary='\n'.join(lines), data=data,
|
||||
escalations=[], actions_taken=[])
|
||||
|
||||
if data.get('mode') == 'create_from_receipts':
|
||||
if self._actions_taken:
|
||||
lines = '\n'.join(f' • {a}' for a in self._actions_taken)
|
||||
summary = (
|
||||
f'Expense report created successfully:\n{lines}\n\n'
|
||||
'The report is in draft. Please open Odoo > Expenses, '
|
||||
'The report is in draft. Please open Odoo › Expenses, '
|
||||
'review the entries, and click Submit to send for approval.'
|
||||
)
|
||||
status = 'complete'
|
||||
|
||||
Reference in New Issue
Block a user