Folder priority + path penalty: match folder segments only, not filenames
Both _folder_priority and _path_penalty previously scanned the entire path string, including the basename, so a file named 'mytrashed_pic.jpg' in /photos/MobileBackup/ would falsely match the 'trash' token. Now only directory segments are checked: the filename itself never influences keeper selection, only the folders that contain it do. The depth penalty in _path_penalty likewise counts directory segments only. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -424,16 +424,22 @@ def _load_folder_priority() -> tuple[tuple[tuple[str, int], ...], int]:
|
||||
|
||||
|
||||
def _folder_priority(path: str) -> int:
    """Return the worst (highest) priority bucket matching a directory segment.

    The path is split on "/" and every segment except the last one (the
    filename basename) is lower-cased and checked against the configured
    tokens by substring match. The basename is intentionally excluded so
    that only folder names influence priority.

    Args:
        path: "/"-separated file path; may be empty.

    Returns:
        The highest priority among all (token, priority) entries whose token
        occurs in some directory segment, or the default bucket when the path
        is empty, has no parent folder, or matches no token.
    """
    entries, default_bucket = _load_folder_priority()
    if not path:
        return default_bucket

    # Split on "/", dropping empty segments (leading or doubled slashes).
    segments = [s.lower() for s in path.split("/") if s]
    if len(segments) <= 1:
        return default_bucket  # no parent folder

    # Everything but the last segment is a directory name.
    dir_segments = segments[:-1]

    # NOTE(review): segments are lower-cased but tokens are used as loaded;
    # presumably _load_folder_priority() yields lower-case tokens — confirm.
    worst: int | None = None
    for seg in dir_segments:
        for token, prio in entries:
            if token in seg and (worst is None or prio > worst):
                worst = prio
    return worst if worst is not None else default_bucket
|
||||
|
||||
|
||||
@@ -446,26 +452,33 @@ _DUP_FOLDER_TOKENS = (
|
||||
|
||||
|
||||
def _path_penalty(path: str) -> int:
    """Return a penalty score; higher means a worse keeper candidate.

    Only FOLDER segments are inspected — the filename basename (the last
    "/"-separated segment) never contributes. The score is the sum of:

    * 100 for each directory segment that looks like a copy/backup folder
      according to ``_DUP_FOLDER_TOKENS``;
    * 30 for each directory segment that repeats an earlier one
      (case-insensitively), e.g. "Desktop/Desktop/Files";
    * 5 for every directory level beyond the sixth.

    Args:
        path: "/"-separated file path; may be empty.

    Returns:
        The accumulated penalty, or 0 for an empty path.
    """
    if not path:
        return 0
    segments = [s for s in path.split("/") if s]
    if not segments:
        return 0

    # Folder segments only — exclude the filename basename. Lower-case once
    # so both passes below compare case-insensitively.
    dir_segments = [s.lower() for s in segments[:-1]]

    score = 0
    for low in dir_segments:
        words = low.split()
        for tok in _DUP_FOLDER_TOKENS:
            # A token counts when it is a whole whitespace-separated word, is
            # joined with an underscore, or starts/ends the folder name
            # (startswith also covers the exact-match case).
            if (tok in words
                    or f"_{tok}" in low or f"{tok}_" in low
                    or low.startswith(tok) or low.endswith(tok)):
                score += 100
                break  # at most one token hit per folder

    # Repeated folder segments like "Desktop/Desktop/Files" suggest a nested backup
    seen: set[str] = set()
    for low in dir_segments:
        if low in seen:
            score += 30
        seen.add(low)

    # Slight penalty for very deep paths (originals tend to live shallower)
    score += max(0, len(dir_segments) - 6) * 5
    return score
|
||||
|
||||
|
||||
|
||||
2
debian/build-deb.sh
vendored
2
debian/build-deb.sh
vendored
@@ -13,7 +13,7 @@ BUILD_DIR="$REPO_ROOT/build/deb"
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
|
||||
PKG_NAME="dupfinder"
|
||||
PKG_VERSION="1.0.10"
|
||||
PKG_VERSION="1.0.11"
|
||||
PKG_ARCH="amd64"
|
||||
DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user