Add explicit folder-priority ranking for keeper selection
#recycle (10) ranks worst, MobileBackup (1) best, default 2. Folder priority dominates resolution + path-penalty; mtime stays as final tiebreak. Override via /data/folder_priority.json (cached per process). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -364,10 +364,71 @@ class UnionFind:
|
||||
|
||||
# ── Detection passes ──────────────────────────────────────────────────────────
|
||||
|
||||
# Folder-name signals: any path segment containing one of these tokens
|
||||
# is treated as evidence the file is a duplicate copy, not the canonical original.
|
||||
# Tokens are matched case-insensitively as substrings of each path segment, so
|
||||
# "Trashed", "trash_old", "MyDups" all match.
|
||||
# Explicit folder-priority ranking. Lower number = higher priority (preferred
|
||||
# keeper). Higher number = mark redundant. Tokens match case-insensitively as
|
||||
# substrings of the full path. When a path matches multiple tokens the WORST
|
||||
# (highest) number wins — so /photos/#recycle/MobileBackup/foo.jpg ranks as
|
||||
# #recycle (10), not MobileBackup (1).
|
||||
#
|
||||
# Override at runtime by writing /data/folder_priority.json:
|
||||
# {"priorities": {"my_folder": 5, "trash": 10}, "default": 2}
|
||||
_FOLDER_PRIORITY_DEFAULTS = (
|
||||
("#recycle", 10),
|
||||
("photoprism", 9),
|
||||
("photoprizm", 8),
|
||||
("photolibrary", 7),
|
||||
("albumsbackup", 6),
|
||||
("organized", 5),
|
||||
("moved", 4),
|
||||
("random", 3),
|
||||
("mobilebackup", 1),
|
||||
)
|
||||
_FOLDER_PRIORITY_DEFAULT_BUCKET = 2 # "anything else"
|
||||
|
||||
_folder_priority_cache: tuple[tuple[tuple[str, int], ...], int] | None = None
|
||||
|
||||
|
||||
def _load_folder_priority() -> tuple[tuple[tuple[str, int], ...], int]:
|
||||
"""Load folder priority list from /data/folder_priority.json if present,
|
||||
else fall back to defaults. Cached after first call per process."""
|
||||
global _folder_priority_cache
|
||||
if _folder_priority_cache is not None:
|
||||
return _folder_priority_cache
|
||||
entries: tuple[tuple[str, int], ...] = _FOLDER_PRIORITY_DEFAULTS
|
||||
default_bucket = _FOLDER_PRIORITY_DEFAULT_BUCKET
|
||||
try:
|
||||
import json
|
||||
path = "/data/folder_priority.json"
|
||||
if os.path.exists(path):
|
||||
with open(path) as f:
|
||||
data = json.load(f)
|
||||
entries = tuple(
|
||||
(k.lower(), int(v))
|
||||
for k, v in (data.get("priorities") or {}).items()
|
||||
)
|
||||
default_bucket = int(data.get("default", default_bucket))
|
||||
except Exception:
|
||||
pass
|
||||
_folder_priority_cache = (entries, default_bucket)
|
||||
return _folder_priority_cache
|
||||
|
||||
|
||||
def _folder_priority(path: str) -> int:
    """Return the worst (highest) priority bucket matching this path, or default.

    Tokens match case-insensitively as substrings of the full path; when a
    path matches several tokens the highest (worst) bucket wins. An empty
    path, or one matching no token, gets the configured default bucket.
    """
    entries, default_bucket = _load_folder_priority()
    if not path:
        return default_bucket
    low = path.lower()
    # Worst match wins; `default=` covers the no-match case.
    return max(
        (prio for token, prio in entries if token in low),
        default=default_bucket,
    )
|
||||
|
||||
|
||||
# Generic copy/backup signal — applies on top of explicit folder priority as a
|
||||
# tiebreaker. Tokens match as whole-word-ish substrings of each path segment.
|
||||
_DUP_FOLDER_TOKENS = (
|
||||
"trash", "trashed", "dup", "dups", "duplicate", "duplicates",
|
||||
"backup", "backups", "copy", "copies", "old", "archive", "archived",
|
||||
@@ -401,26 +462,28 @@ def _path_penalty(path: str) -> int:
|
||||
def _suggested_keeper_by_resolution(members: list[dict]) -> int:
    """Return file_id of best keeper.

    Ranking, in order (lower wins):
      1. Folder priority bucket (explicit list, e.g. #recycle = worst)
      2. Highest pixel count (tie → largest file_size)
      3. Lowest path penalty (Trashed/, Dups/, Backup/, deep nesting)
      4. Earliest mtime (originals are usually older than their copies)
      5. Earliest exif_datetime
    """
    def res_size(m):
        # Negate for descending sort with min(): bigger resolution / bigger
        # file ranks earlier. Missing width/height/size count as 0.
        return (-(m["width"] or 0) * (m["height"] or 0), -(m["file_size"] or 0))

    def rank(m):
        path = m.get("path") or ""
        return (
            _folder_priority(path),
            res_size(m),
            _path_penalty(path),
            # Sentinel sorts missing timestamps after any real value.
            m.get("file_mtime") or "9999",
            m.get("exif_datetime") or "9999-99-99T99:99:99",
        )

    return min(members, key=rank)["id"]
|
||||
|
||||
|
||||
def _suggested_keeper_oldest(members: list[dict]) -> int:
|
||||
|
||||
2
debian/build-deb.sh
vendored
2
debian/build-deb.sh
vendored
@@ -13,7 +13,7 @@ BUILD_DIR="$REPO_ROOT/build/deb"
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
PKG_NAME="dupfinder"
PKG_VERSION="1.0.5"  # bumped for folder-priority keeper ranking
PKG_ARCH="amd64"
DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"
|
||||
|
||||
Reference in New Issue
Block a user