Smarter keeper selection: folder-name + mtime signals

Adds a path-penalty score that downranks files in folders named Trashed,
Dups, Backup, Copy, Old, Archive, plus a penalty for repeated path segments
(e.g. Desktop/Desktop/Files) and very deep paths. Also captures and uses
file mtime as a tiebreaker — older files are usually the originals.

Applied to all four detection passes (sha256, phash, exif, filesize+dim)
and to auto-resolve-exact.

New file_mtime column with idempotent migration.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Carlos
2026-04-24 10:56:52 -04:00
parent 4d57b0af74
commit 14c6012808
3 changed files with 74 additions and 16 deletions

View File

@@ -490,7 +490,8 @@ def auto_resolve_exact():
for gid in groups: for gid in groups:
cur.execute(""" cur.execute("""
SELECT f.id, f.width, f.height, f.file_size, f.exif_datetime SELECT f.id, f.path, f.width, f.height, f.file_size,
f.exif_datetime, f.file_mtime
FROM duplicate_members dm FROM duplicate_members dm
JOIN files f ON f.id = dm.file_id JOIN files f ON f.id = dm.file_id
WHERE dm.group_id = ? WHERE dm.group_id = ?

View File

@@ -101,6 +101,7 @@ def init_db():
exif_device TEXT, exif_device TEXT,
width INTEGER, width INTEGER,
height INTEGER, height INTEGER,
file_mtime TEXT,
is_takeout INTEGER DEFAULT 0, is_takeout INTEGER DEFAULT 0,
is_edited INTEGER DEFAULT 0, is_edited INTEGER DEFAULT 0,
takeout_json TEXT, takeout_json TEXT,
@@ -169,6 +170,12 @@ def init_db():
cur.execute(f"ALTER TABLE scans ADD COLUMN {col} {defn}") cur.execute(f"ALTER TABLE scans ADD COLUMN {col} {defn}")
except Exception: except Exception:
pass # column already exists pass # column already exists
# Migration: file_mtime added in v1.0.3 for keeper-selection scoring
try:
cur.execute("ALTER TABLE files ADD COLUMN file_mtime TEXT")
except Exception:
pass
con.commit() con.commit()
# ── Detect interrupted scans from previous run ──────────────────────────── # ── Detect interrupted scans from previous run ────────────────────────────
@@ -298,6 +305,7 @@ def extract_file(path: str) -> dict:
"exif_device": None, "exif_device": None,
"width": None, "width": None,
"height": None, "height": None,
"file_mtime": _mtime_str(path),
} }
try: try:
@@ -356,17 +364,63 @@ class UnionFind:
# ── Detection passes ────────────────────────────────────────────────────────── # ── Detection passes ──────────────────────────────────────────────────────────
# Folder-name signals: a path segment that matches one of these tokens is
# treated as evidence the file is a duplicate copy, not the canonical original.
# Segments come from splitting the path on "/", so the final (file-name)
# segment is checked as well as the directories.
#
# Matching (see _path_penalty) is case-insensitive but is NOT a plain substring
# test. A lower-cased segment matches a token when the token is:
#   - the whole segment, or one of its whitespace-separated words,
#   - a prefix or a suffix of the segment, or
#   - immediately preceded or followed by an underscore.
# So "Trashed", "trash_old" and "MyDups" all match, while a token buried
# mid-segment behind another separator ("my-backup-2020") does not.
# NOTE(review): prefix/suffix matching is loose — e.g. "Gold" ends with "old"
# and is therefore penalised too.
_DUP_FOLDER_TOKENS = (
"trash", "trashed", "dup", "dups", "duplicate", "duplicates",
"backup", "backups", "copy", "copies", "old", "archive", "archived",
)
def _path_penalty(path: str) -> int:
    """Score how much *path* looks like a copy/backup location.

    A larger score means a worse keeper candidate. The score is the sum of:

    * 100 for every "/"-separated segment that matches a token from
      ``_DUP_FOLDER_TOKENS`` (at most one hit is counted per segment),
    * 30 for every segment that repeats an earlier one, compared
      case-insensitively (e.g. ``Desktop/Desktop/Files``),
    * 5 for every segment beyond the sixth.

    An empty or falsy *path* scores 0.
    """
    if not path:
        return 0

    # Empty pieces (leading, trailing or doubled slashes) are discarded.
    parts = [piece.lower() for piece in path.split("/") if piece]

    def looks_like_copy(name: str) -> bool:
        # A token counts when it is the whole segment, a whitespace-separated
        # word, a prefix/suffix, or sits directly next to an underscore.
        words = name.split()
        return any(
            tok in words
            or tok == name
            or f"_{tok}" in name
            or f"{tok}_" in name
            or name.startswith(tok)
            or name.endswith(tok)
            for tok in _DUP_FOLDER_TOKENS
        )

    token_hits = sum(1 for name in parts if looks_like_copy(name))
    # Every occurrence of a segment after its first one is a repeat.
    repeats = len(parts) - len(set(parts))
    # Originals tend to live shallower, so only the excess depth is charged.
    excess_depth = max(0, len(parts) - 6)

    return 100 * token_hits + 30 * repeats + 5 * excess_depth
def _suggested_keeper_by_resolution(members: list[dict]) -> int: def _suggested_keeper_by_resolution(members: list[dict]) -> int:
"""Return file_id of best keeper: largest pixels, tie-break by file size, """Return file_id of best keeper.
final tie-break by oldest exif_datetime (likely the original)."""
Ranking, in order:
1. Highest pixel count (tie → largest file_size)
2. Lowest path penalty (avoid Trashed/, Dups/, Backup/, deep nesting)
3. Earliest mtime (originals are usually older than their copies)
4. Earliest exif_datetime
"""
def res_size(m): def res_size(m):
return ((m["width"] or 0) * (m["height"] or 0), m["file_size"] or 0) return ((m["width"] or 0) * (m["height"] or 0), m["file_size"] or 0)
top = max(res_size(m) for m in members) top = max(res_size(m) for m in members)
tied = [m for m in members if res_size(m) == top] tied = [m for m in members if res_size(m) == top]
return min(
tied, key=lambda m: m.get("exif_datetime") or "9999-99-99T99:99:99" def rank(m):
)["id"] return (
_path_penalty(m.get("path") or ""),
m.get("file_mtime") or "9999",
m.get("exif_datetime") or "9999-99-99T99:99:99",
)
return min(tied, key=rank)["id"]
def _suggested_keeper_oldest(members: list[dict]) -> int: def _suggested_keeper_oldest(members: list[dict]) -> int:
@@ -388,7 +442,7 @@ def _run_sha256_pass(con: sqlite3.Connection, scan_id: int):
for row in rows: for row in rows:
sha = row["sha256"] sha = row["sha256"]
cur.execute(""" cur.execute("""
SELECT id, width, height, file_size, exif_datetime SELECT id, path, width, height, file_size, exif_datetime, file_mtime
FROM files WHERE sha256 = ? FROM files WHERE sha256 = ?
""", (sha,)) """, (sha,))
members = [dict(r) for r in cur.fetchall()] members = [dict(r) for r in cur.fetchall()]
@@ -411,7 +465,8 @@ def _run_phash_pass(con: sqlite3.Connection, scan_id: int):
cur = con.cursor() cur = con.cursor()
# Exclude files already in sha256 groups # Exclude files already in sha256 groups
cur.execute(""" cur.execute("""
SELECT f.id, f.phash, f.width, f.height, f.file_size, f.exif_datetime SELECT f.id, f.path, f.phash, f.width, f.height, f.file_size,
f.exif_datetime, f.file_mtime
FROM files f FROM files f
WHERE f.phash IS NOT NULL WHERE f.phash IS NOT NULL
AND length(f.phash) = 16 AND length(f.phash) = 16
@@ -508,7 +563,7 @@ def _run_exif_pass(con: sqlite3.Connection, scan_id: int):
for row in rows: for row in rows:
dt, dev = row["exif_datetime"], row["exif_device"] dt, dev = row["exif_datetime"], row["exif_device"]
cur.execute(""" cur.execute("""
SELECT id, width, height, file_size, exif_datetime SELECT id, path, width, height, file_size, exif_datetime, file_mtime
FROM files FROM files
WHERE exif_datetime = ? AND exif_device = ? WHERE exif_datetime = ? AND exif_device = ?
""", (dt, dev)) """, (dt, dev))
@@ -547,12 +602,13 @@ def _run_filesize_pass(con: sqlite3.Connection, scan_id: int):
for row in rows: for row in rows:
fs, w, h = row["file_size"], row["width"], row["height"] fs, w, h = row["file_size"], row["width"], row["height"]
cur.execute(""" cur.execute("""
SELECT id, width, height, file_size, exif_datetime SELECT id, path, width, height, file_size, exif_datetime, file_mtime
FROM files FROM files
WHERE file_size = ? AND width = ? AND height = ? WHERE file_size = ? AND width = ? AND height = ?
""", (fs, w, h)) """, (fs, w, h))
members = [dict(r) for r in cur.fetchall()] members = [dict(r) for r in cur.fetchall()]
keeper_id = _suggested_keeper_oldest(members) # Filesize+dim is the weakest signal — folder/mtime tiebreak helps a lot here
keeper_id = _suggested_keeper_by_resolution(members)
method_value = f"{fs}::{w}x{h}" method_value = f"{fs}::{w}x{h}"
cur.execute( cur.execute(
"INSERT INTO duplicate_groups (method, method_value) VALUES ('filesize', ?)", "INSERT INTO duplicate_groups (method, method_value) VALUES ('filesize', ?)",
@@ -677,7 +733,8 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
file_size=:file_size, mime_type=:mime_type, file_size=:file_size, mime_type=:mime_type,
sha256=:sha256, exif_datetime=:exif_datetime, sha256=:sha256, exif_datetime=:exif_datetime,
exif_device=:exif_device, width=:width, exif_device=:exif_device, width=:width,
height=:height, scan_id=:scan_id, height=:height, file_mtime=:file_mtime,
scan_id=:scan_id,
status='pending', updated_at=CURRENT_TIMESTAMP status='pending', updated_at=CURRENT_TIMESTAMP
WHERE path=:path WHERE path=:path
""", record) """, record)
@@ -686,12 +743,12 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
INSERT OR IGNORE INTO files INSERT OR IGNORE INTO files
(path, filename, extension, file_size, mime_type, (path, filename, extension, file_size, mime_type,
sha256, exif_datetime, exif_device, width, sha256, exif_datetime, exif_device, width,
height, scan_id, status) height, file_mtime, scan_id, status)
VALUES VALUES
(:path, :filename, :extension, :file_size, (:path, :filename, :extension, :file_size,
:mime_type, :sha256, :exif_datetime, :mime_type, :sha256, :exif_datetime,
:exif_device, :width, :height, :scan_id, :exif_device, :width, :height, :file_mtime,
'pending') :scan_id, 'pending')
""", record) """, record)
with ThreadPoolExecutor(max_workers=N_WORKERS) as pool: with ThreadPoolExecutor(max_workers=N_WORKERS) as pool:

2
debian/build-deb.sh vendored
View File

@@ -13,7 +13,7 @@ BUILD_DIR="$REPO_ROOT/build/deb"
# ── Config ──────────────────────────────────────────────────────────────────── # ── Config ────────────────────────────────────────────────────────────────────
PKG_NAME="dupfinder" PKG_NAME="dupfinder"
PKG_VERSION="1.0.2" PKG_VERSION="1.0.3"
PKG_ARCH="amd64" PKG_ARCH="amd64"
DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb" DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"