From 399a80cb7034b12e2578a3cb604f4d1eeec7a4e7 Mon Sep 17 00:00:00 2001 From: Carlos Date: Sun, 26 Apr 2026 15:52:04 -0400 Subject: [PATCH] Add explicit folder-priority ranking for keeper selection #recycle (10) ranks worst, MobileBackup (1) best, default 2. Folder priority dominates resolution + path-penalty; mtime stays as final tiebreak. Override via /data/folder_priority.json (cached per process). Co-Authored-By: Claude Opus 4.7 --- app/scanner.py | 93 +++++++++++++++++++++++++++++++++++++-------- debian/build-deb.sh | 2 +- 2 files changed, 79 insertions(+), 16 deletions(-) diff --git a/app/scanner.py b/app/scanner.py index 6e5a2b0..c8e8358 100644 --- a/app/scanner.py +++ b/app/scanner.py @@ -364,10 +364,71 @@ class UnionFind: # ── Detection passes ────────────────────────────────────────────────────────── -# Folder-name signals: any path segment containing one of these tokens -# is treated as evidence the file is a duplicate copy, not the canonical original. -# Tokens are matched case-insensitively as substrings of each path segment, so -# "Trashed", "trash_old", "MyDups" all match. +# Explicit folder-priority ranking. Lower number = higher priority (preferred +# keeper). Higher number = mark redundant. Tokens match case-insensitively as +# substrings of the full path. When a path matches multiple tokens the WORST +# (highest) number wins — so /photos/#recycle/MobileBackup/foo.jpg ranks as +# #recycle (10), not MobileBackup (1). 
+# +# Override at runtime by writing /data/folder_priority.json: +# {"priorities": {"my_folder": 5, "trash": 10}, "default": 2} +_FOLDER_PRIORITY_DEFAULTS = ( + ("#recycle", 10), + ("photoprism", 9), + ("photoprizm", 8), + ("photolibrary", 7), + ("albumsbackup", 6), + ("organized", 5), + ("moved", 4), + ("random", 3), + ("mobilebackup", 1), +) +_FOLDER_PRIORITY_DEFAULT_BUCKET = 2 # "anything else" + +_folder_priority_cache: tuple[tuple[tuple[str, int], ...], int] | None = None + + +def _load_folder_priority() -> tuple[tuple[tuple[str, int], ...], int]: + """Load folder priority list from /data/folder_priority.json if present, + else fall back to defaults. Cached after first call per process.""" + global _folder_priority_cache + if _folder_priority_cache is not None: + return _folder_priority_cache + entries: tuple[tuple[str, int], ...] = _FOLDER_PRIORITY_DEFAULTS + default_bucket = _FOLDER_PRIORITY_DEFAULT_BUCKET + try: + import json + path = "/data/folder_priority.json" + if os.path.exists(path): + with open(path) as f: + data = json.load(f) + entries = tuple( + (k.lower(), int(v)) + for k, v in (data.get("priorities") or {}).items() + ) + default_bucket = int(data.get("default", default_bucket)) + except Exception: + pass + _folder_priority_cache = (entries, default_bucket) + return _folder_priority_cache + + +def _folder_priority(path: str) -> int: + """Return the worst (highest) priority bucket matching this path, or default.""" + if not path: + entries, default_bucket = _load_folder_priority() + return default_bucket + entries, default_bucket = _load_folder_priority() + low = path.lower() + worst: int | None = None + for token, prio in entries: + if token in low and (worst is None or prio > worst): + worst = prio + return worst if worst is not None else default_bucket + + +# Generic copy/backup signal — applies on top of explicit folder priority as a +# tiebreaker. Tokens match as whole-word-ish substrings of each path segment. 
def _suggested_keeper_by_resolution(members: list[dict]) -> int:
    """Return file_id of best keeper.

    Ranking, in order (lower wins):
      1. Folder priority bucket (explicit list, e.g. #recycle = worst)
      2. Highest pixel count (tie → largest file_size)
      3. Lowest path penalty (Trashed/, Dups/, Backup/, deep nesting)
      4. Earliest mtime (originals are usually older than their copies)
      5. Earliest exif_datetime
    """
    def sort_key(member: dict):
        member_path = member.get("path") or ""
        pixels = (member["width"] or 0) * (member["height"] or 0)
        size = member["file_size"] or 0
        return (
            _folder_priority(member_path),
            # Negated so min() prefers the largest resolution, then size.
            (-pixels, -size),
            _path_penalty(member_path),
            member.get("file_mtime") or "9999",
            member.get("exif_datetime") or "9999-99-99T99:99:99",
        )

    return min(members, key=sort_key)["id"]