Smarter keeper selection: folder-name + mtime signals

Adds a path-penalty score that downranks files in folders named Trashed,
Dups, Backup, Copy, Old, Archive, plus a penalty for repeated path segments
(e.g. Desktop/Desktop/Files) and very deep paths. Also captures and uses
file mtime as a tiebreaker — older files are usually the originals.

Applied to all four detection passes (sha256, phash, exif, filesize+dim)
and to auto-resolve-exact.

New file_mtime column with idempotent migration.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Carlos
2026-04-24 10:56:52 -04:00
parent 4d57b0af74
commit 14c6012808
3 changed files with 74 additions and 16 deletions

View File

@@ -490,7 +490,8 @@ def auto_resolve_exact():
for gid in groups:
cur.execute("""
SELECT f.id, f.width, f.height, f.file_size, f.exif_datetime
SELECT f.id, f.path, f.width, f.height, f.file_size,
f.exif_datetime, f.file_mtime
FROM duplicate_members dm
JOIN files f ON f.id = dm.file_id
WHERE dm.group_id = ?

View File

@@ -101,6 +101,7 @@ def init_db():
exif_device TEXT,
width INTEGER,
height INTEGER,
file_mtime TEXT,
is_takeout INTEGER DEFAULT 0,
is_edited INTEGER DEFAULT 0,
takeout_json TEXT,
@@ -169,6 +170,12 @@ def init_db():
cur.execute(f"ALTER TABLE scans ADD COLUMN {col} {defn}")
except Exception:
pass # column already exists
# Migration: file_mtime added in v1.0.3 for keeper-selection scoring
try:
cur.execute("ALTER TABLE files ADD COLUMN file_mtime TEXT")
except Exception:
pass
con.commit()
# ── Detect interrupted scans from previous run ────────────────────────────
@@ -298,6 +305,7 @@ def extract_file(path: str) -> dict:
"exif_device": None,
"width": None,
"height": None,
"file_mtime": _mtime_str(path),
}
try:
@@ -356,17 +364,63 @@ class UnionFind:
# ── Detection passes ──────────────────────────────────────────────────────────
# Folder-name signals: any path segment containing one of these tokens
# is treated as evidence the file is a duplicate copy, not the canonical original.
# Tokens are matched case-insensitively as substrings of each path segment, so
# "Trashed", "trash_old", "MyDups" all match.
_DUP_FOLDER_TOKENS = (
"trash", "trashed", "dup", "dups", "duplicate", "duplicates",
"backup", "backups", "copy", "copies", "old", "archive", "archived",
)
def _path_penalty(path: str) -> int:
"""Higher = worse keeper candidate. Penalises paths that look like copies/backups."""
if not path:
return 0
segments = [s for s in path.split("/") if s]
score = 0
for seg in segments:
low = seg.lower()
for tok in _DUP_FOLDER_TOKENS:
if tok in low.split() or tok == low or f"_{tok}" in low or f"{tok}_" in low or low.startswith(tok) or low.endswith(tok):
score += 100
break
# Repeated segments like "Desktop/Desktop/Files" suggest a nested backup
seen: set[str] = set()
for seg in segments:
low = seg.lower()
if low in seen:
score += 30
seen.add(low)
# Slight penalty for very deep paths (originals tend to live shallower)
score += max(0, len(segments) - 6) * 5
return score
def _suggested_keeper_by_resolution(members: list[dict]) -> int:
    """Return the file_id of the best keeper among *members*.

    Ranking, in order:
      1. Highest pixel count (tie → largest file_size)
      2. Lowest path penalty (avoid Trashed/, Dups/, Backup/, deep nesting)
      3. Earliest mtime (originals are usually older than their copies)
      4. Earliest exif_datetime

    Missing timestamps sort last via "9999…" sentinels, so files with real
    dates win ties over files without.
    """
    def res_size(m):
        # Primary key: (pixel count, file size); NULL columns count as 0.
        return ((m["width"] or 0) * (m["height"] or 0), m["file_size"] or 0)

    top = max(res_size(m) for m in members)
    tied = [m for m in members if res_size(m) == top]

    def rank(m):
        # Lower is better on every component.
        return (
            _path_penalty(m.get("path") or ""),
            m.get("file_mtime") or "9999",
            m.get("exif_datetime") or "9999-99-99T99:99:99",
        )

    return min(tied, key=rank)["id"]
def _suggested_keeper_oldest(members: list[dict]) -> int:
@@ -388,7 +442,7 @@ def _run_sha256_pass(con: sqlite3.Connection, scan_id: int):
for row in rows:
sha = row["sha256"]
cur.execute("""
SELECT id, width, height, file_size, exif_datetime
SELECT id, path, width, height, file_size, exif_datetime, file_mtime
FROM files WHERE sha256 = ?
""", (sha,))
members = [dict(r) for r in cur.fetchall()]
@@ -411,7 +465,8 @@ def _run_phash_pass(con: sqlite3.Connection, scan_id: int):
cur = con.cursor()
# Exclude files already in sha256 groups
cur.execute("""
SELECT f.id, f.phash, f.width, f.height, f.file_size, f.exif_datetime
SELECT f.id, f.path, f.phash, f.width, f.height, f.file_size,
f.exif_datetime, f.file_mtime
FROM files f
WHERE f.phash IS NOT NULL
AND length(f.phash) = 16
@@ -508,7 +563,7 @@ def _run_exif_pass(con: sqlite3.Connection, scan_id: int):
for row in rows:
dt, dev = row["exif_datetime"], row["exif_device"]
cur.execute("""
SELECT id, width, height, file_size, exif_datetime
SELECT id, path, width, height, file_size, exif_datetime, file_mtime
FROM files
WHERE exif_datetime = ? AND exif_device = ?
""", (dt, dev))
@@ -547,12 +602,13 @@ def _run_filesize_pass(con: sqlite3.Connection, scan_id: int):
for row in rows:
fs, w, h = row["file_size"], row["width"], row["height"]
cur.execute("""
SELECT id, width, height, file_size, exif_datetime
SELECT id, path, width, height, file_size, exif_datetime, file_mtime
FROM files
WHERE file_size = ? AND width = ? AND height = ?
""", (fs, w, h))
members = [dict(r) for r in cur.fetchall()]
keeper_id = _suggested_keeper_oldest(members)
# Filesize+dim is the weakest signal — folder/mtime tiebreak helps a lot here
keeper_id = _suggested_keeper_by_resolution(members)
method_value = f"{fs}::{w}x{h}"
cur.execute(
"INSERT INTO duplicate_groups (method, method_value) VALUES ('filesize', ?)",
@@ -677,7 +733,8 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
file_size=:file_size, mime_type=:mime_type,
sha256=:sha256, exif_datetime=:exif_datetime,
exif_device=:exif_device, width=:width,
height=:height, scan_id=:scan_id,
height=:height, file_mtime=:file_mtime,
scan_id=:scan_id,
status='pending', updated_at=CURRENT_TIMESTAMP
WHERE path=:path
""", record)
@@ -686,12 +743,12 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
INSERT OR IGNORE INTO files
(path, filename, extension, file_size, mime_type,
sha256, exif_datetime, exif_device, width,
height, scan_id, status)
height, file_mtime, scan_id, status)
VALUES
(:path, :filename, :extension, :file_size,
:mime_type, :sha256, :exif_datetime,
:exif_device, :width, :height, :scan_id,
'pending')
:exif_device, :width, :height, :file_mtime,
:scan_id, 'pending')
""", record)
with ThreadPoolExecutor(max_workers=N_WORKERS) as pool: