Smarter keeper selection: folder-name + mtime signals

Adds a path-penalty score that downranks files in folders named Trashed,
Dups, Backup, Copy, Old, Archive, plus a penalty for repeated path segments
(e.g. Desktop/Desktop/Files) and very deep paths. Also captures and uses
file mtime as a tiebreaker — older files are usually the originals.

Applied to all four detection passes (sha256, phash, exif, filesize+dim)
and to auto-resolve-exact.

New file_mtime column with idempotent migration.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Carlos
2026-04-24 10:56:52 -04:00
parent 4d57b0af74
commit 14c6012808
3 changed files with 74 additions and 16 deletions

View File

@@ -490,7 +490,8 @@ def auto_resolve_exact():
for gid in groups:
cur.execute("""
SELECT f.id, f.width, f.height, f.file_size, f.exif_datetime
SELECT f.id, f.path, f.width, f.height, f.file_size,
f.exif_datetime, f.file_mtime
FROM duplicate_members dm
JOIN files f ON f.id = dm.file_id
WHERE dm.group_id = ?

View File

@@ -101,6 +101,7 @@ def init_db():
exif_device TEXT,
width INTEGER,
height INTEGER,
file_mtime TEXT,
is_takeout INTEGER DEFAULT 0,
is_edited INTEGER DEFAULT 0,
takeout_json TEXT,
@@ -169,6 +170,12 @@ def init_db():
cur.execute(f"ALTER TABLE scans ADD COLUMN {col} {defn}")
except Exception:
pass # column already exists
# Migration: file_mtime added in v1.0.3 for keeper-selection scoring
try:
cur.execute("ALTER TABLE files ADD COLUMN file_mtime TEXT")
except Exception:
pass
con.commit()
# ── Detect interrupted scans from previous run ────────────────────────────
@@ -298,6 +305,7 @@ def extract_file(path: str) -> dict:
"exif_device": None,
"width": None,
"height": None,
"file_mtime": _mtime_str(path),
}
try:
@@ -356,17 +364,63 @@ class UnionFind:
# ── Detection passes ──────────────────────────────────────────────────────────
# Folder-name signals: any path segment containing one of these tokens
# is treated as evidence the file is a duplicate copy, not the canonical original.
# Tokens are matched case-insensitively as substrings of each path segment, so
# "Trashed", "trash_old", "MyDups" all match.
_DUP_FOLDER_TOKENS = (
"trash", "trashed", "dup", "dups", "duplicate", "duplicates",
"backup", "backups", "copy", "copies", "old", "archive", "archived",
)
def _path_penalty(path: str) -> int:
"""Higher = worse keeper candidate. Penalises paths that look like copies/backups."""
if not path:
return 0
segments = [s for s in path.split("/") if s]
score = 0
for seg in segments:
low = seg.lower()
for tok in _DUP_FOLDER_TOKENS:
if tok in low.split() or tok == low or f"_{tok}" in low or f"{tok}_" in low or low.startswith(tok) or low.endswith(tok):
score += 100
break
# Repeated segments like "Desktop/Desktop/Files" suggest a nested backup
seen: set[str] = set()
for seg in segments:
low = seg.lower()
if low in seen:
score += 30
seen.add(low)
# Slight penalty for very deep paths (originals tend to live shallower)
score += max(0, len(segments) - 6) * 5
return score
def _suggested_keeper_by_resolution(members: list[dict]) -> int:
    """Return the file_id of the best keeper among *members*.

    Ranking, in order:
      1. Highest pixel count (tie → largest file_size)
      2. Lowest path penalty (avoid Trashed/, Dups/, Backup/, deep nesting)
      3. Earliest mtime (originals are usually older than their copies)
      4. Earliest exif_datetime

    Missing timestamps sort last via "9999…" sentinels, so files with real
    dates win ties over files without.
    """
    def res_size(m):
        # Primary key: (pixel count, file size); NULL columns count as 0.
        return ((m["width"] or 0) * (m["height"] or 0), m["file_size"] or 0)

    top = max(res_size(m) for m in members)
    tied = [m for m in members if res_size(m) == top]

    def rank(m):
        # Lower is better on every component.
        return (
            _path_penalty(m.get("path") or ""),
            m.get("file_mtime") or "9999",
            m.get("exif_datetime") or "9999-99-99T99:99:99",
        )

    return min(tied, key=rank)["id"]
def _suggested_keeper_oldest(members: list[dict]) -> int:
@@ -388,7 +442,7 @@ def _run_sha256_pass(con: sqlite3.Connection, scan_id: int):
for row in rows:
sha = row["sha256"]
cur.execute("""
SELECT id, width, height, file_size, exif_datetime
SELECT id, path, width, height, file_size, exif_datetime, file_mtime
FROM files WHERE sha256 = ?
""", (sha,))
members = [dict(r) for r in cur.fetchall()]
@@ -411,7 +465,8 @@ def _run_phash_pass(con: sqlite3.Connection, scan_id: int):
cur = con.cursor()
# Exclude files already in sha256 groups
cur.execute("""
SELECT f.id, f.phash, f.width, f.height, f.file_size, f.exif_datetime
SELECT f.id, f.path, f.phash, f.width, f.height, f.file_size,
f.exif_datetime, f.file_mtime
FROM files f
WHERE f.phash IS NOT NULL
AND length(f.phash) = 16
@@ -508,7 +563,7 @@ def _run_exif_pass(con: sqlite3.Connection, scan_id: int):
for row in rows:
dt, dev = row["exif_datetime"], row["exif_device"]
cur.execute("""
SELECT id, width, height, file_size, exif_datetime
SELECT id, path, width, height, file_size, exif_datetime, file_mtime
FROM files
WHERE exif_datetime = ? AND exif_device = ?
""", (dt, dev))
@@ -547,12 +602,13 @@ def _run_filesize_pass(con: sqlite3.Connection, scan_id: int):
for row in rows:
fs, w, h = row["file_size"], row["width"], row["height"]
cur.execute("""
SELECT id, width, height, file_size, exif_datetime
SELECT id, path, width, height, file_size, exif_datetime, file_mtime
FROM files
WHERE file_size = ? AND width = ? AND height = ?
""", (fs, w, h))
members = [dict(r) for r in cur.fetchall()]
keeper_id = _suggested_keeper_oldest(members)
# Filesize+dim is the weakest signal — folder/mtime tiebreak helps a lot here
keeper_id = _suggested_keeper_by_resolution(members)
method_value = f"{fs}::{w}x{h}"
cur.execute(
"INSERT INTO duplicate_groups (method, method_value) VALUES ('filesize', ?)",
@@ -677,7 +733,8 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
file_size=:file_size, mime_type=:mime_type,
sha256=:sha256, exif_datetime=:exif_datetime,
exif_device=:exif_device, width=:width,
height=:height, scan_id=:scan_id,
height=:height, file_mtime=:file_mtime,
scan_id=:scan_id,
status='pending', updated_at=CURRENT_TIMESTAMP
WHERE path=:path
""", record)
@@ -686,12 +743,12 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
INSERT OR IGNORE INTO files
(path, filename, extension, file_size, mime_type,
sha256, exif_datetime, exif_device, width,
height, scan_id, status)
height, file_mtime, scan_id, status)
VALUES
(:path, :filename, :extension, :file_size,
:mime_type, :sha256, :exif_datetime,
:exif_device, :width, :height, :scan_id,
'pending')
:exif_device, :width, :height, :file_mtime,
:scan_id, 'pending')
""", record)
with ThreadPoolExecutor(max_workers=N_WORKERS) as pool: