diff --git a/app/main.py b/app/main.py index 6046c18..31fc31e 100644 --- a/app/main.py +++ b/app/main.py @@ -490,7 +490,8 @@ def auto_resolve_exact(): for gid in groups: cur.execute(""" - SELECT f.id, f.width, f.height, f.file_size, f.exif_datetime + SELECT f.id, f.path, f.width, f.height, f.file_size, + f.exif_datetime, f.file_mtime FROM duplicate_members dm JOIN files f ON f.id = dm.file_id WHERE dm.group_id = ? diff --git a/app/scanner.py b/app/scanner.py index 2ca4841..6e5a2b0 100644 --- a/app/scanner.py +++ b/app/scanner.py @@ -101,6 +101,7 @@ def init_db(): exif_device TEXT, width INTEGER, height INTEGER, + file_mtime TEXT, is_takeout INTEGER DEFAULT 0, is_edited INTEGER DEFAULT 0, takeout_json TEXT, @@ -169,6 +170,12 @@ def init_db(): cur.execute(f"ALTER TABLE scans ADD COLUMN {col} {defn}") except Exception: pass # column already exists + + # Migration: file_mtime added in v1.0.3 for keeper-selection scoring + try: + cur.execute("ALTER TABLE files ADD COLUMN file_mtime TEXT") + except Exception: + pass con.commit() # ── Detect interrupted scans from previous run ──────────────────────────── @@ -298,6 +305,7 @@ def extract_file(path: str) -> dict: "exif_device": None, "width": None, "height": None, + "file_mtime": _mtime_str(path), } try: @@ -356,17 +364,63 @@ class UnionFind: # ── Detection passes ────────────────────────────────────────────────────────── +# Folder-name signals: any path segment matching one of these tokens +# is treated as evidence the file is a duplicate copy, not the canonical original. +# Tokens are compared case-insensitively against each path segment: a token +# matches when it equals, starts, or ends the segment, or appears space- or +# underscore-delimited inside it — "Trashed", "trash_old", "MyDups" all match. +_DUP_FOLDER_TOKENS = ( + "trash", "trashed", "dup", "dups", "duplicate", "duplicates", + "backup", "backups", "copy", "copies", "old", "archive", "archived", ) + + +def _path_penalty(path: str) -> int: + """Higher = worse keeper candidate. 
Penalises paths that look like copies/backups.""" + if not path: + return 0 + segments = [s for s in path.split("/") if s] + score = 0 + for seg in segments: + low = seg.lower() + for tok in _DUP_FOLDER_TOKENS: + if tok in low.split() or tok == low or f"_{tok}" in low or f"{tok}_" in low or low.startswith(tok) or low.endswith(tok): + score += 100 + break + # Repeated segments like "Desktop/Desktop/Files" suggest a nested backup + seen: set[str] = set() + for seg in segments: + low = seg.lower() + if low in seen: + score += 30 + seen.add(low) + # Slight penalty for very deep paths (originals tend to live shallower) + score += max(0, len(segments) - 6) * 5 + return score + + def _suggested_keeper_by_resolution(members: list[dict]) -> int: - """Return file_id of best keeper: largest pixels, tie-break by file size, - final tie-break by oldest exif_datetime (likely the original).""" + """Return file_id of best keeper. + + Ranking, in order: + 1. Highest pixel count (tie → largest file_size) + 2. Lowest path penalty (avoid Trashed/, Dups/, Backup/, deep nesting) + 3. Earliest mtime (originals are usually older than their copies) + 4. Earliest exif_datetime + """ def res_size(m): return ((m["width"] or 0) * (m["height"] or 0), m["file_size"] or 0) top = max(res_size(m) for m in members) tied = [m for m in members if res_size(m) == top] - return min( - tied, key=lambda m: m.get("exif_datetime") or "9999-99-99T99:99:99" - )["id"] + + def rank(m): + return ( + _path_penalty(m.get("path") or ""), + m.get("file_mtime") or "9999", + m.get("exif_datetime") or "9999-99-99T99:99:99", + ) + + return min(tied, key=rank)["id"] def _suggested_keeper_oldest(members: list[dict]) -> int: @@ -388,7 +442,7 @@ def _run_sha256_pass(con: sqlite3.Connection, scan_id: int): for row in rows: sha = row["sha256"] cur.execute(""" - SELECT id, width, height, file_size, exif_datetime + SELECT id, path, width, height, file_size, exif_datetime, file_mtime FROM files WHERE sha256 = ? 
""", (sha,)) members = [dict(r) for r in cur.fetchall()] @@ -411,7 +465,8 @@ def _run_phash_pass(con: sqlite3.Connection, scan_id: int): cur = con.cursor() # Exclude files already in sha256 groups cur.execute(""" - SELECT f.id, f.phash, f.width, f.height, f.file_size, f.exif_datetime + SELECT f.id, f.path, f.phash, f.width, f.height, f.file_size, + f.exif_datetime, f.file_mtime FROM files f WHERE f.phash IS NOT NULL AND length(f.phash) = 16 @@ -508,7 +563,7 @@ def _run_exif_pass(con: sqlite3.Connection, scan_id: int): for row in rows: dt, dev = row["exif_datetime"], row["exif_device"] cur.execute(""" - SELECT id, width, height, file_size, exif_datetime + SELECT id, path, width, height, file_size, exif_datetime, file_mtime FROM files WHERE exif_datetime = ? AND exif_device = ? """, (dt, dev)) @@ -547,12 +602,13 @@ def _run_filesize_pass(con: sqlite3.Connection, scan_id: int): for row in rows: fs, w, h = row["file_size"], row["width"], row["height"] cur.execute(""" - SELECT id, width, height, file_size, exif_datetime + SELECT id, path, width, height, file_size, exif_datetime, file_mtime FROM files WHERE file_size = ? AND width = ? AND height = ? 
""", (fs, w, h)) members = [dict(r) for r in cur.fetchall()] - keeper_id = _suggested_keeper_oldest(members) + # Filesize+dim is the weakest signal — folder/mtime tiebreak helps a lot here + keeper_id = _suggested_keeper_by_resolution(members) method_value = f"{fs}::{w}x{h}" cur.execute( "INSERT INTO duplicate_groups (method, method_value) VALUES ('filesize', ?)", @@ -677,7 +733,8 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"): file_size=:file_size, mime_type=:mime_type, sha256=:sha256, exif_datetime=:exif_datetime, exif_device=:exif_device, width=:width, - height=:height, scan_id=:scan_id, + height=:height, file_mtime=:file_mtime, + scan_id=:scan_id, status='pending', updated_at=CURRENT_TIMESTAMP WHERE path=:path """, record) @@ -686,12 +743,12 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"): INSERT OR IGNORE INTO files (path, filename, extension, file_size, mime_type, sha256, exif_datetime, exif_device, width, - height, scan_id, status) + height, file_mtime, scan_id, status) VALUES (:path, :filename, :extension, :file_size, :mime_type, :sha256, :exif_datetime, - :exif_device, :width, :height, :scan_id, - 'pending') + :exif_device, :width, :height, :file_mtime, + :scan_id, 'pending') """, record) with ThreadPoolExecutor(max_workers=N_WORKERS) as pool: diff --git a/debian/build-deb.sh b/debian/build-deb.sh index f96638c..8ed8ab6 100644 --- a/debian/build-deb.sh +++ b/debian/build-deb.sh @@ -13,7 +13,7 @@ BUILD_DIR="$REPO_ROOT/build/deb" # ── Config ──────────────────────────────────────────────────────────────────── PKG_NAME="dupfinder" -PKG_VERSION="1.0.2" +PKG_VERSION="1.0.3" PKG_ARCH="amd64" DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"