Add explicit folder-priority ranking for keeper selection
#recycle (10) ranks worst, MobileBackup (1) ranks best, and paths matching no token default to 2. Folder priority dominates resolution and path penalty; mtime, then exif_datetime, remain the final tiebreaks. Override via /data/folder_priority.json (cached per process). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -364,10 +364,71 @@ class UnionFind:
|
|||||||
|
|
||||||
# ── Detection passes ──────────────────────────────────────────────────────────
|
# ── Detection passes ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
# Folder-name signals: any path segment containing one of these tokens
|
# Explicit folder-priority ranking. Lower number = higher priority (preferred
|
||||||
# is treated as evidence the file is a duplicate copy, not the canonical original.
|
# keeper). Higher number = mark redundant. Tokens match case-insensitively as
|
||||||
# Tokens are matched case-insensitively as substrings of each path segment, so
|
# substrings of the full path. When a path matches multiple tokens the WORST
|
||||||
# "Trashed", "trash_old", "MyDups" all match.
|
# (highest) number wins — so /photos/#recycle/MobileBackup/foo.jpg ranks as
|
||||||
|
# #recycle (10), not MobileBackup (1).
|
||||||
|
#
|
||||||
|
# Override at runtime by writing /data/folder_priority.json:
|
||||||
|
# {"priorities": {"my_folder": 5, "trash": 10}, "default": 2}
|
||||||
|
_FOLDER_PRIORITY_DEFAULTS = (
|
||||||
|
("#recycle", 10),
|
||||||
|
("photoprism", 9),
|
||||||
|
("photoprizm", 8),
|
||||||
|
("photolibrary", 7),
|
||||||
|
("albumsbackup", 6),
|
||||||
|
("organized", 5),
|
||||||
|
("moved", 4),
|
||||||
|
("random", 3),
|
||||||
|
("mobilebackup", 1),
|
||||||
|
)
|
||||||
|
_FOLDER_PRIORITY_DEFAULT_BUCKET = 2 # "anything else"
|
||||||
|
|
||||||
|
_folder_priority_cache: tuple[tuple[tuple[str, int], ...], int] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _load_folder_priority() -> tuple[tuple[tuple[str, int], ...], int]:
    """Return ``(entries, default_bucket)`` used for folder-priority ranking.

    ``entries`` is a tuple of ``(token, priority)`` pairs and
    ``default_bucket`` is the priority for paths that match no token.

    The built-in defaults are used unless ``/data/folder_priority.json``
    exists and parses, in which case it replaces them. Expected shape::

        {"priorities": {"my_folder": 5, "trash": 10}, "default": 2}

    Tokens from the file are lower-cased. A missing or empty ``"priorities"``
    object yields an empty entry list (the built-in tokens are NOT merged in);
    a missing ``"default"`` keeps the built-in default bucket.

    The override is applied all-or-nothing: if the file is unreadable, is not
    valid JSON, or contains values that cannot be converted to ``int``, a
    warning is logged and the built-in defaults are used unchanged.

    The result is stored in the module-level cache after the first call, so
    the file is read at most once per process.
    """
    global _folder_priority_cache
    if _folder_priority_cache is not None:
        return _folder_priority_cache

    # Function-scope imports keep this block self-contained.
    import json
    import logging

    log = logging.getLogger(__name__)
    path = "/data/folder_priority.json"

    entries: tuple[tuple[str, int], ...] = _FOLDER_PRIORITY_DEFAULTS
    default_bucket = _FOLDER_PRIORITY_DEFAULT_BUCKET

    data = None
    try:
        # EAFP: open directly instead of exists()-then-open, which is racy.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # No override file is the normal case; stay on defaults silently.
        pass
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        log.warning("Ignoring unreadable %s: %s", path, exc)

    if data is not None:
        try:
            loaded_entries = tuple(
                (token.lower(), int(prio))
                for token, prio in (data.get("priorities") or {}).items()
            )
            loaded_default = int(data.get("default", default_bucket))
        except (AttributeError, TypeError, ValueError, OverflowError) as exc:
            # Wrong top-level type, non-mapping "priorities", or a value
            # that is not int-convertible: keep the defaults rather than
            # applying half of the override.
            log.warning("Ignoring malformed %s: %s", path, exc)
        else:
            entries, default_bucket = loaded_entries, loaded_default

    _folder_priority_cache = (entries, default_bucket)
    return _folder_priority_cache
|
||||||
|
|
||||||
|
|
||||||
|
def _folder_priority(path: str) -> int:
    """Return the folder-priority bucket for ``path``.

    Each configured token is tested as a case-insensitive substring of the
    whole path. When several tokens match, the highest number (the worst
    bucket) wins. An empty path, or a path matching no token, gets the
    default bucket.

    Args:
        path: File path to classify; may be empty.

    Returns:
        The highest priority number among matching tokens, or the default
        bucket from ``_load_folder_priority()`` when nothing matches.
    """
    entries, default_bucket = _load_folder_priority()
    if not path:
        return default_bucket

    low = path.lower()
    # max(..., default=...) covers the "no token matched" case without a
    # sentinel variable.
    return max(
        (prio for token, prio in entries if token in low),
        default=default_bucket,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Generic copy/backup signal — applies on top of explicit folder priority as a
|
||||||
|
# tiebreaker. Tokens match as whole-word-ish substrings of each path segment.
|
||||||
_DUP_FOLDER_TOKENS = (
|
_DUP_FOLDER_TOKENS = (
|
||||||
"trash", "trashed", "dup", "dups", "duplicate", "duplicates",
|
"trash", "trashed", "dup", "dups", "duplicate", "duplicates",
|
||||||
"backup", "backups", "copy", "copies", "old", "archive", "archived",
|
"backup", "backups", "copy", "copies", "old", "archive", "archived",
|
||||||
@@ -401,26 +462,28 @@ def _path_penalty(path: str) -> int:
|
|||||||
def _suggested_keeper_by_resolution(members: list[dict]) -> int:
|
def _suggested_keeper_by_resolution(members: list[dict]) -> int:
|
||||||
"""Return file_id of best keeper.
|
"""Return file_id of best keeper.
|
||||||
|
|
||||||
Ranking, in order:
|
Ranking, in order (lower wins):
|
||||||
1. Highest pixel count (tie → largest file_size)
|
1. Folder priority bucket (explicit list, e.g. #recycle = worst)
|
||||||
2. Lowest path penalty (avoid Trashed/, Dups/, Backup/, deep nesting)
|
2. Highest pixel count (tie → largest file_size)
|
||||||
3. Earliest mtime (originals are usually older than their copies)
|
3. Lowest path penalty (Trashed/, Dups/, Backup/, deep nesting)
|
||||||
4. Earliest exif_datetime
|
4. Earliest mtime (originals are usually older than their copies)
|
||||||
|
5. Earliest exif_datetime
|
||||||
"""
|
"""
|
||||||
def res_size(m):
|
def res_size(m):
|
||||||
return ((m["width"] or 0) * (m["height"] or 0), m["file_size"] or 0)
|
# Negate for descending sort with min()
|
||||||
|
return (-(m["width"] or 0) * (m["height"] or 0), -(m["file_size"] or 0))
|
||||||
top = max(res_size(m) for m in members)
|
|
||||||
tied = [m for m in members if res_size(m) == top]
|
|
||||||
|
|
||||||
def rank(m):
|
def rank(m):
|
||||||
|
path = m.get("path") or ""
|
||||||
return (
|
return (
|
||||||
_path_penalty(m.get("path") or ""),
|
_folder_priority(path),
|
||||||
|
res_size(m),
|
||||||
|
_path_penalty(path),
|
||||||
m.get("file_mtime") or "9999",
|
m.get("file_mtime") or "9999",
|
||||||
m.get("exif_datetime") or "9999-99-99T99:99:99",
|
m.get("exif_datetime") or "9999-99-99T99:99:99",
|
||||||
)
|
)
|
||||||
|
|
||||||
return min(tied, key=rank)["id"]
|
return min(members, key=rank)["id"]
|
||||||
|
|
||||||
|
|
||||||
def _suggested_keeper_oldest(members: list[dict]) -> int:
|
def _suggested_keeper_oldest(members: list[dict]) -> int:
|
||||||
|
|||||||
2
debian/build-deb.sh
vendored
2
debian/build-deb.sh
vendored
@@ -13,7 +13,7 @@ BUILD_DIR="$REPO_ROOT/build/deb"
|
|||||||
|
|
||||||
# ── Config ────────────────────────────────────────────────────────────────────
|
# ── Config ────────────────────────────────────────────────────────────────────
|
||||||
PKG_NAME="dupfinder"
|
PKG_NAME="dupfinder"
|
||||||
PKG_VERSION="1.0.4"
|
PKG_VERSION="1.0.5"
|
||||||
PKG_ARCH="amd64"
|
PKG_ARCH="amd64"
|
||||||
DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"
|
DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user