Folder priority + path penalty: match folder segments only, not filenames

Both _folder_priority and _path_penalty were scanning the entire path
string including the basename. A file named 'mytrashed_pic.jpg' in
/photos/MobileBackup/ would falsely match the 'trash' token.

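Now only directory segments are checked; the filename itself no longer
influences keeper selection, only the folders that contain it do. In
_path_penalty the repeated-segment and depth checks likewise count
directory segments only, so the depth penalty now starts one level deeper.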

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Carlos
2026-04-26 18:48:30 -04:00
parent 3128ddc593
commit 8b0fee0055
2 changed files with 28 additions and 15 deletions

View File

@@ -424,15 +424,21 @@ def _load_folder_priority() -> tuple[tuple[tuple[str, int], ...], int]:
def _folder_priority(path: str) -> int: def _folder_priority(path: str) -> int:
"""Return the worst (highest) priority bucket matching this path, or default.""" """Return the worst (highest) priority bucket matching any DIRECTORY segment
of this path, or default. Filename basename is intentionally excluded —
only folder names influence priority."""
entries, default_bucket = _load_folder_priority()
if not path: if not path:
entries, default_bucket = _load_folder_priority()
return default_bucket return default_bucket
entries, default_bucket = _load_folder_priority() # Split on /, drop empty segments, drop the last (filename basename).
low = path.lower() segments = [s.lower() for s in path.split("/") if s]
if len(segments) <= 1:
return default_bucket # no parent folder
dir_segments = segments[:-1]
worst: int | None = None worst: int | None = None
for seg in dir_segments:
for token, prio in entries: for token, prio in entries:
if token in low and (worst is None or prio > worst): if token in seg and (worst is None or prio > worst):
worst = prio worst = prio
return worst if worst is not None else default_bucket return worst if worst is not None else default_bucket
@@ -446,26 +452,33 @@ _DUP_FOLDER_TOKENS = (
def _path_penalty(path: str) -> int: def _path_penalty(path: str) -> int:
"""Higher = worse keeper candidate. Penalises paths that look like copies/backups.""" """Higher = worse keeper candidate. Penalises FOLDERS (not filenames) that
look like copies/backups, plus repeated segments and very deep paths."""
if not path: if not path:
return 0 return 0
segments = [s for s in path.split("/") if s] segments = [s for s in path.split("/") if s]
if not segments:
return 0
# Folder segments only — exclude filename basename
dir_segments = segments[:-1]
score = 0 score = 0
for seg in segments: for seg in dir_segments:
low = seg.lower() low = seg.lower()
for tok in _DUP_FOLDER_TOKENS: for tok in _DUP_FOLDER_TOKENS:
if tok in low.split() or tok == low or f"_{tok}" in low or f"{tok}_" in low or low.startswith(tok) or low.endswith(tok): if (tok in low.split() or tok == low
or f"_{tok}" in low or f"{tok}_" in low
or low.startswith(tok) or low.endswith(tok)):
score += 100 score += 100
break break
# Repeated segments like "Desktop/Desktop/Files" suggest a nested backup # Repeated folder segments like "Desktop/Desktop/Files" suggest a nested backup
seen: set[str] = set() seen: set[str] = set()
for seg in segments: for seg in dir_segments:
low = seg.lower() low = seg.lower()
if low in seen: if low in seen:
score += 30 score += 30
seen.add(low) seen.add(low)
# Slight penalty for very deep paths (originals tend to live shallower) # Slight penalty for very deep paths (originals tend to live shallower)
score += max(0, len(segments) - 6) * 5 score += max(0, len(dir_segments) - 6) * 5
return score return score

2
debian/build-deb.sh vendored
View File

@@ -13,7 +13,7 @@ BUILD_DIR="$REPO_ROOT/build/deb"
# ── Config ──────────────────────────────────────────────────────────────────── # ── Config ────────────────────────────────────────────────────────────────────
PKG_NAME="dupfinder" PKG_NAME="dupfinder"
PKG_VERSION="1.0.10" PKG_VERSION="1.0.11"
PKG_ARCH="amd64" PKG_ARCH="amd64"
DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb" DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"