From 8b0fee0055a0974ef8fd80058338168ee507fd50 Mon Sep 17 00:00:00 2001 From: Carlos Date: Sun, 26 Apr 2026 18:48:30 -0400 Subject: [PATCH] Folder priority + path penalty: match folder segments only, not filenames Both _folder_priority and _path_penalty were scanning the entire path string including the basename. A file named 'mytrashed_pic.jpg' in /photos/MobileBackup/ would falsely match the 'trash' token. Now only directory segments are checked; filename never influences keeper selection beyond its actual path location. Co-Authored-By: Claude Opus 4.7 --- app/scanner.py | 41 +++++++++++++++++++++++++++-------------- debian/build-deb.sh | 2 +- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/app/scanner.py b/app/scanner.py index 7486e27..24f7d9a 100644 --- a/app/scanner.py +++ b/app/scanner.py @@ -424,16 +424,22 @@ def _load_folder_priority() -> tuple[tuple[tuple[str, int], ...], int]: def _folder_priority(path: str) -> int: - """Return the worst (highest) priority bucket matching this path, or default.""" - if not path: - entries, default_bucket = _load_folder_priority() - return default_bucket + """Return the worst (highest) priority bucket matching any DIRECTORY segment + of this path, or default. Filename basename is intentionally excluded — + only folder names influence priority.""" entries, default_bucket = _load_folder_priority() - low = path.lower() + if not path: + return default_bucket + # Split on /, drop empty segments, drop the last (filename basename). + segments = [s.lower() for s in path.split("/") if s] + if len(segments) <= 1: + return default_bucket # no parent folder + dir_segments = segments[:-1] worst: int | None = None - for token, prio in entries: - if token in low and (worst is None or prio > worst): - worst = prio + for seg in dir_segments: + for token, prio in entries: + if token in seg and (worst is None or prio > worst): + worst = prio return worst if worst is not None else default_bucket @@ -446,26 +452,33 @@ _DUP_FOLDER_TOKENS = ( def _path_penalty(path: str) -> int: - """Higher = worse keeper candidate. Penalises paths that look like copies/backups.""" + """Higher = worse keeper candidate. Penalises FOLDERS (not filenames) that + look like copies/backups, plus repeated segments and very deep paths.""" if not path: return 0 segments = [s for s in path.split("/") if s] + if not segments: + return 0 + # Folder segments only — exclude filename basename + dir_segments = segments[:-1] score = 0 - for seg in segments: + for seg in dir_segments: low = seg.lower() for tok in _DUP_FOLDER_TOKENS: - if tok in low.split() or tok == low or f"_{tok}" in low or f"{tok}_" in low or low.startswith(tok) or low.endswith(tok): + if (tok in low.split() or tok == low + or f"_{tok}" in low or f"{tok}_" in low + or low.startswith(tok) or low.endswith(tok)): score += 100 break - # Repeated segments like "Desktop/Desktop/Files" suggest a nested backup + # Repeated folder segments like "Desktop/Desktop/Files" suggest a nested backup seen: set[str] = set() - for seg in segments: + for seg in dir_segments: low = seg.lower() if low in seen: score += 30 seen.add(low) # Slight penalty for very deep paths (originals tend to live shallower) - score += max(0, len(segments) - 6) * 5 + score += max(0, len(dir_segments) - 6) * 5 return score diff --git a/debian/build-deb.sh b/debian/build-deb.sh index aeed470..3c739a6 100644 --- a/debian/build-deb.sh +++ b/debian/build-deb.sh @@ -13,7 +13,7 @@ BUILD_DIR="$REPO_ROOT/build/deb" # ── Config ──────────────────────────────────────────────────────────────────── PKG_NAME="dupfinder" -PKG_VERSION="1.0.10" +PKG_VERSION="1.0.11" PKG_ARCH="amd64" DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"