Smarter keeper selection: folder-name + mtime signals
Adds a path-penalty score that downranks files in folders named Trashed, Dups, Backup, Copy, Old, Archive, plus a penalty for repeated path segments (e.g. Desktop/Desktop/Files) and very deep paths. Also captures and uses file mtime as a tiebreaker — older files are usually the originals. Applied to all four detection passes (sha256, phash, exif, filesize+dim) and to auto-resolve-exact. New file_mtime column with idempotent migration. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -490,7 +490,8 @@ def auto_resolve_exact():
|
||||
|
||||
for gid in groups:
|
||||
cur.execute("""
|
||||
SELECT f.id, f.width, f.height, f.file_size, f.exif_datetime
|
||||
SELECT f.id, f.path, f.width, f.height, f.file_size,
|
||||
f.exif_datetime, f.file_mtime
|
||||
FROM duplicate_members dm
|
||||
JOIN files f ON f.id = dm.file_id
|
||||
WHERE dm.group_id = ?
|
||||
|
||||
@@ -101,6 +101,7 @@ def init_db():
|
||||
exif_device TEXT,
|
||||
width INTEGER,
|
||||
height INTEGER,
|
||||
file_mtime TEXT,
|
||||
is_takeout INTEGER DEFAULT 0,
|
||||
is_edited INTEGER DEFAULT 0,
|
||||
takeout_json TEXT,
|
||||
@@ -169,6 +170,12 @@ def init_db():
|
||||
cur.execute(f"ALTER TABLE scans ADD COLUMN {col} {defn}")
|
||||
except Exception:
|
||||
pass # column already exists
|
||||
|
||||
# Migration: file_mtime added in v1.0.3 for keeper-selection scoring
|
||||
try:
|
||||
cur.execute("ALTER TABLE files ADD COLUMN file_mtime TEXT")
|
||||
except Exception:
|
||||
pass
|
||||
con.commit()
|
||||
|
||||
# ── Detect interrupted scans from previous run ────────────────────────────
|
||||
@@ -298,6 +305,7 @@ def extract_file(path: str) -> dict:
|
||||
"exif_device": None,
|
||||
"width": None,
|
||||
"height": None,
|
||||
"file_mtime": _mtime_str(path),
|
||||
}
|
||||
|
||||
try:
|
||||
@@ -356,17 +364,63 @@ class UnionFind:
|
||||
|
||||
# ── Detection passes ──────────────────────────────────────────────────────────
|
||||
|
||||
# Folder-name signals: any path segment containing one of these tokens
|
||||
# is treated as evidence the file is a duplicate copy, not the canonical original.
|
||||
# Tokens are matched case-insensitively as substrings of each path segment, so
|
||||
# "Trashed", "trash_old", "MyDups" all match.
|
||||
_DUP_FOLDER_TOKENS = (
|
||||
"trash", "trashed", "dup", "dups", "duplicate", "duplicates",
|
||||
"backup", "backups", "copy", "copies", "old", "archive", "archived",
|
||||
)
|
||||
|
||||
|
||||
def _path_penalty(path: str) -> int:
|
||||
"""Higher = worse keeper candidate. Penalises paths that look like copies/backups."""
|
||||
if not path:
|
||||
return 0
|
||||
segments = [s for s in path.split("/") if s]
|
||||
score = 0
|
||||
for seg in segments:
|
||||
low = seg.lower()
|
||||
for tok in _DUP_FOLDER_TOKENS:
|
||||
if tok in low.split() or tok == low or f"_{tok}" in low or f"{tok}_" in low or low.startswith(tok) or low.endswith(tok):
|
||||
score += 100
|
||||
break
|
||||
# Repeated segments like "Desktop/Desktop/Files" suggest a nested backup
|
||||
seen: set[str] = set()
|
||||
for seg in segments:
|
||||
low = seg.lower()
|
||||
if low in seen:
|
||||
score += 30
|
||||
seen.add(low)
|
||||
# Slight penalty for very deep paths (originals tend to live shallower)
|
||||
score += max(0, len(segments) - 6) * 5
|
||||
return score
|
||||
|
||||
|
||||
def _suggested_keeper_by_resolution(members: list[dict]) -> int:
    """Return file_id of the best keeper among duplicate-group members.

    Ranking, in order:
      1. Highest pixel count (tie → largest file_size)
      2. Lowest path penalty (avoid Trashed/, Dups/, Backup/, deep nesting)
      3. Earliest mtime (originals are usually older than their copies)
      4. Earliest exif_datetime

    Each member dict is expected to carry at least id/width/height/file_size;
    path, file_mtime, and exif_datetime are optional and default to "worst"
    values when missing so members lacking them lose ties.
    """
    # NOTE(review): the scraped diff had merged the old and new versions of
    # this function (duplicate docstring + unreachable early return); this is
    # the reconstructed post-change implementation.
    def res_size(m):
        # Primary key: (pixel count, file size); None columns count as 0.
        return ((m["width"] or 0) * (m["height"] or 0), m["file_size"] or 0)

    top = max(res_size(m) for m in members)
    tied = [m for m in members if res_size(m) == top]

    def rank(m):
        # Tiebreak among max-resolution candidates; missing values sort last
        # ("9999..." sentinels) so known-good metadata always wins.
        return (
            _path_penalty(m.get("path") or ""),
            m.get("file_mtime") or "9999",
            m.get("exif_datetime") or "9999-99-99T99:99:99",
        )

    return min(tied, key=rank)["id"]
|
||||
|
||||
|
||||
def _suggested_keeper_oldest(members: list[dict]) -> int:
|
||||
@@ -388,7 +442,7 @@ def _run_sha256_pass(con: sqlite3.Connection, scan_id: int):
|
||||
for row in rows:
|
||||
sha = row["sha256"]
|
||||
cur.execute("""
|
||||
SELECT id, width, height, file_size, exif_datetime
|
||||
SELECT id, path, width, height, file_size, exif_datetime, file_mtime
|
||||
FROM files WHERE sha256 = ?
|
||||
""", (sha,))
|
||||
members = [dict(r) for r in cur.fetchall()]
|
||||
@@ -411,7 +465,8 @@ def _run_phash_pass(con: sqlite3.Connection, scan_id: int):
|
||||
cur = con.cursor()
|
||||
# Exclude files already in sha256 groups
|
||||
cur.execute("""
|
||||
SELECT f.id, f.phash, f.width, f.height, f.file_size, f.exif_datetime
|
||||
SELECT f.id, f.path, f.phash, f.width, f.height, f.file_size,
|
||||
f.exif_datetime, f.file_mtime
|
||||
FROM files f
|
||||
WHERE f.phash IS NOT NULL
|
||||
AND length(f.phash) = 16
|
||||
@@ -508,7 +563,7 @@ def _run_exif_pass(con: sqlite3.Connection, scan_id: int):
|
||||
for row in rows:
|
||||
dt, dev = row["exif_datetime"], row["exif_device"]
|
||||
cur.execute("""
|
||||
SELECT id, width, height, file_size, exif_datetime
|
||||
SELECT id, path, width, height, file_size, exif_datetime, file_mtime
|
||||
FROM files
|
||||
WHERE exif_datetime = ? AND exif_device = ?
|
||||
""", (dt, dev))
|
||||
@@ -547,12 +602,13 @@ def _run_filesize_pass(con: sqlite3.Connection, scan_id: int):
|
||||
for row in rows:
|
||||
fs, w, h = row["file_size"], row["width"], row["height"]
|
||||
cur.execute("""
|
||||
SELECT id, width, height, file_size, exif_datetime
|
||||
SELECT id, path, width, height, file_size, exif_datetime, file_mtime
|
||||
FROM files
|
||||
WHERE file_size = ? AND width = ? AND height = ?
|
||||
""", (fs, w, h))
|
||||
members = [dict(r) for r in cur.fetchall()]
|
||||
keeper_id = _suggested_keeper_oldest(members)
|
||||
# Filesize+dim is the weakest signal — folder/mtime tiebreak helps a lot here
|
||||
keeper_id = _suggested_keeper_by_resolution(members)
|
||||
method_value = f"{fs}::{w}x{h}"
|
||||
cur.execute(
|
||||
"INSERT INTO duplicate_groups (method, method_value) VALUES ('filesize', ?)",
|
||||
@@ -677,7 +733,8 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
|
||||
file_size=:file_size, mime_type=:mime_type,
|
||||
sha256=:sha256, exif_datetime=:exif_datetime,
|
||||
exif_device=:exif_device, width=:width,
|
||||
height=:height, scan_id=:scan_id,
|
||||
height=:height, file_mtime=:file_mtime,
|
||||
scan_id=:scan_id,
|
||||
status='pending', updated_at=CURRENT_TIMESTAMP
|
||||
WHERE path=:path
|
||||
""", record)
|
||||
@@ -686,12 +743,12 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
|
||||
INSERT OR IGNORE INTO files
|
||||
(path, filename, extension, file_size, mime_type,
|
||||
sha256, exif_datetime, exif_device, width,
|
||||
height, scan_id, status)
|
||||
height, file_mtime, scan_id, status)
|
||||
VALUES
|
||||
(:path, :filename, :extension, :file_size,
|
||||
:mime_type, :sha256, :exif_datetime,
|
||||
:exif_device, :width, :height, :scan_id,
|
||||
'pending')
|
||||
:exif_device, :width, :height, :file_mtime,
|
||||
:scan_id, 'pending')
|
||||
""", record)
|
||||
|
||||
with ThreadPoolExecutor(max_workers=N_WORKERS) as pool:
|
||||
|
||||
Reference in New Issue
Block a user