Folder priority + path penalty: match folder segments only, not filenames

Both _folder_priority and _path_penalty were scanning the entire path
string including the basename. A file named 'mytrashed_pic.jpg' in
/photos/MobileBackup/ would falsely match the 'trash' token.

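Now only directory segments are checked; the filename itself never
influences keeper selection, only the folders that contain it do. The
repeated-segment and deep-path penalties in _path_penalty likewise count
directory segments only.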

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Carlos
2026-04-26 18:48:30 -04:00
parent 3128ddc593
commit 8b0fee0055
2 changed files with 28 additions and 15 deletions

View File

@@ -424,16 +424,22 @@ def _load_folder_priority() -> tuple[tuple[tuple[str, int], ...], int]:
def _folder_priority(path: str) -> int:
    """Return the worst (highest) priority bucket matching a directory segment.

    Only the folder names of ``path`` are inspected; the final segment (the
    filename) is intentionally excluded so that a file such as
    ``mytrashed_pic.jpg`` cannot trigger a ``trash`` token by accident.

    Args:
        path: A ``/``-separated file path. Empty segments (leading, trailing
            or doubled slashes) are ignored.

    Returns:
        The highest priority value among the configured tokens that occur as
        a substring of any directory segment, or the configured default
        bucket when ``path`` is empty, has no parent folder, or nothing
        matches.
    """
    entries, default_bucket = _load_folder_priority()
    if not path:
        return default_bucket

    # Compare case-insensitively; segments are lowered once here.
    # NOTE(review): assumes the configured tokens are already lowercase and
    # contain no "/" (a token spanning two segments can never match) — confirm
    # against _load_folder_priority.
    segments = [s.lower() for s in path.split("/") if s]

    # The last segment is the filename; with zero or one segment there is no
    # parent folder, so this is empty and the default bucket is returned.
    dir_segments = segments[:-1]

    matched = (
        prio
        for token, prio in entries
        if any(token in seg for seg in dir_segments)
    )
    return max(matched, default=default_bucket)
@@ -446,26 +452,33 @@ _DUP_FOLDER_TOKENS = (
def _path_penalty(path: str) -> int:
    """Score how poor a keeper candidate ``path`` is (higher = worse).

    Only folder segments are considered; the filename (last segment) never
    contributes. The score is the sum of three parts:

    * +100 for each folder whose name looks like a copy/backup according to
      ``_DUP_FOLDER_TOKENS`` (at most once per folder),
    * +30 for each folder name that repeats an earlier one, e.g.
      ``Desktop/Desktop/Files``, which suggests a nested backup,
    * +5 for every folder level beyond six, since originals tend to live
      shallower.

    Args:
        path: A ``/``-separated file path. Empty segments are ignored.

    Returns:
        The accumulated penalty; ``0`` for an empty path or a path with no
        non-empty segments.
    """
    if not path:
        return 0
    segments = [s for s in path.split("/") if s]
    if not segments:
        return 0

    # Folder segments only — drop the filename and compare case-insensitively.
    dir_segments = [s.lower() for s in segments[:-1]]

    score = 0
    for low in dir_segments:
        words = low.split()
        for tok in _DUP_FOLDER_TOKENS:
            if (tok in words or tok == low
                    or f"_{tok}" in low or f"{tok}_" in low
                    or low.startswith(tok) or low.endswith(tok)):
                score += 100
                break  # one hit per folder is enough

    # Every occurrence of a folder name after its first counts as a repeat.
    score += 30 * (len(dir_segments) - len(set(dir_segments)))

    # Slight penalty for very deep folder nesting.
    score += max(0, len(dir_segments) - 6) * 5
    return score

2
debian/build-deb.sh vendored
View File

@@ -13,7 +13,7 @@ BUILD_DIR="$REPO_ROOT/build/deb"
# ── Config ────────────────────────────────────────────────────────────────────
# Package identity; the three values below are combined into the .deb file name.
PKG_NAME="dupfinder"
# NOTE(review): PKG_VERSION is assigned twice here — presumably the old and new
# lines of a version bump shown together. In shell the later assignment
# (1.0.11) is the one that takes effect; confirm only one line exists on disk.
PKG_VERSION="1.0.10"
PKG_VERSION="1.0.11"
PKG_ARCH="amd64"
# Artifact name in the form <name>_<version>_<arch>.deb
DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"