Add explicit folder-priority ranking for keeper selection

#recycle (10) ranks worst, MobileBackup (1) best, default 2.
Folder priority dominates resolution + path-penalty; mtime and exif datetime
stay as the final tiebreaks. Override via /data/folder_priority.json (cached per process).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Carlos
2026-04-26 15:52:04 -04:00
parent d95bf69be0
commit 399a80cb70
2 changed files with 79 additions and 16 deletions

View File

@@ -364,10 +364,71 @@ class UnionFind:
# ── Detection passes ──────────────────────────────────────────────────────────
# Folder-name signals: any path segment containing one of these tokens
# is treated as evidence the file is a duplicate copy, not the canonical original.
# Tokens are matched case-insensitively as substrings of each path segment, so
# "Trashed", "trash_old", "MyDups" all match.
# Explicit folder-priority ranking. Lower number = higher priority (preferred
# keeper). Higher number = mark redundant. Tokens match case-insensitively as
# substrings of the full path. When a path matches multiple tokens the WORST
# (highest) number wins — so /photos/#recycle/MobileBackup/foo.jpg ranks as
# #recycle (10), not MobileBackup (1).
#
# Override at runtime by writing /data/folder_priority.json:
# {"priorities": {"my_folder": 5, "trash": 10}, "default": 2}
_FOLDER_PRIORITY_DEFAULTS = (
("#recycle", 10),
("photoprism", 9),
("photoprizm", 8),
("photolibrary", 7),
("albumsbackup", 6),
("organized", 5),
("moved", 4),
("random", 3),
("mobilebackup", 1),
)
_FOLDER_PRIORITY_DEFAULT_BUCKET = 2 # "anything else"
_folder_priority_cache: tuple[tuple[tuple[str, int], ...], int] | None = None
def _load_folder_priority() -> tuple[tuple[tuple[str, int], ...], int]:
"""Load folder priority list from /data/folder_priority.json if present,
else fall back to defaults. Cached after first call per process."""
global _folder_priority_cache
if _folder_priority_cache is not None:
return _folder_priority_cache
entries: tuple[tuple[str, int], ...] = _FOLDER_PRIORITY_DEFAULTS
default_bucket = _FOLDER_PRIORITY_DEFAULT_BUCKET
try:
import json
path = "/data/folder_priority.json"
if os.path.exists(path):
with open(path) as f:
data = json.load(f)
entries = tuple(
(k.lower(), int(v))
for k, v in (data.get("priorities") or {}).items()
)
default_bucket = int(data.get("default", default_bucket))
except Exception:
pass
_folder_priority_cache = (entries, default_bucket)
return _folder_priority_cache
def _folder_priority(path: str) -> int:
    """Return the worst (highest) priority bucket matching this path, or default."""
    entries, default_bucket = _load_folder_priority()
    if not path:
        return default_bucket
    lowered = path.lower()
    # Collect every bucket whose token appears in the path; the worst
    # (highest) one wins, per the table's documented semantics.
    hits = [bucket for token, bucket in entries if token in lowered]
    return max(hits) if hits else default_bucket
# Generic copy/backup signal — applies on top of explicit folder priority as a
# tiebreaker. Tokens match as whole-word-ish substrings of each path segment.
_DUP_FOLDER_TOKENS = (
"trash", "trashed", "dup", "dups", "duplicate", "duplicates",
"backup", "backups", "copy", "copies", "old", "archive", "archived",
@@ -401,26 +462,28 @@ def _path_penalty(path: str) -> int:
def _suggested_keeper_by_resolution(members: list[dict]) -> int:
    """Return the file_id of the best keeper among a duplicate group's members.

    Ranking, in order (lower wins):
      1. Folder priority bucket (explicit list, e.g. #recycle = worst)
      2. Highest pixel count (tie → largest file_size)
      3. Lowest path penalty (Trashed/, Dups/, Backup/, deep nesting)
      4. Earliest mtime (originals are usually older than their copies)
      5. Earliest exif_datetime

    Args:
        members: dicts with at least "id", "width", "height", "file_size",
            and optionally "path", "file_mtime", "exif_datetime".
    """
    def res_size(m):
        # Negated so min() prefers the biggest image, then the biggest file.
        return (-(m["width"] or 0) * (m["height"] or 0), -(m["file_size"] or 0))

    def rank(m):
        path = m.get("path") or ""
        return (
            _folder_priority(path),
            res_size(m),
            _path_penalty(path),
            # Missing timestamps sort last, so members with real dates win.
            m.get("file_mtime") or "9999",
            m.get("exif_datetime") or "9999-99-99T99:99:99",
        )

    return min(members, key=rank)["id"]
def _suggested_keeper_oldest(members: list[dict]) -> int:

2
debian/build-deb.sh vendored
View File

@@ -13,7 +13,7 @@ BUILD_DIR="$REPO_ROOT/build/deb"
# ── Config ────────────────────────────────────────────────────────────────────
PKG_NAME="dupfinder"
PKG_VERSION="1.0.4"
PKG_VERSION="1.0.5"
PKG_ARCH="amd64"
DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"