Folder priority + path penalty: match folder segments only, not filenames
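Both _folder_priority and _path_penalty let the file's basename influence the result: _folder_priority matched tokens against the whole lower-cased path string, and _path_penalty iterated over every path segment, including the last one. A file named 'mytrashed_pic.jpg' in /photos/MobileBackup/ would therefore falsely match the 'trash' token. Now only directory segments are checked, so the filename itself never influences keeper selection; only the folders the file lives in do. As a side effect, the deep-path penalty in _path_penalty now counts directory segments only, so it starts one level deeper than before. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>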
This commit is contained in:
@@ -424,16 +424,22 @@ def _load_folder_priority() -> tuple[tuple[tuple[str, int], ...], int]:
|
|||||||
|
|
||||||
|
|
||||||
def _folder_priority(path: str) -> int:
|
def _folder_priority(path: str) -> int:
|
||||||
"""Return the worst (highest) priority bucket matching this path, or default."""
|
"""Return the worst (highest) priority bucket matching any DIRECTORY segment
|
||||||
if not path:
|
of this path, or default. Filename basename is intentionally excluded —
|
||||||
entries, default_bucket = _load_folder_priority()
|
only folder names influence priority."""
|
||||||
return default_bucket
|
|
||||||
entries, default_bucket = _load_folder_priority()
|
entries, default_bucket = _load_folder_priority()
|
||||||
low = path.lower()
|
if not path:
|
||||||
|
return default_bucket
|
||||||
|
# Split on /, drop empty segments, drop the last (filename basename).
|
||||||
|
segments = [s.lower() for s in path.split("/") if s]
|
||||||
|
if len(segments) <= 1:
|
||||||
|
return default_bucket # no parent folder
|
||||||
|
dir_segments = segments[:-1]
|
||||||
worst: int | None = None
|
worst: int | None = None
|
||||||
for token, prio in entries:
|
for seg in dir_segments:
|
||||||
if token in low and (worst is None or prio > worst):
|
for token, prio in entries:
|
||||||
worst = prio
|
if token in seg and (worst is None or prio > worst):
|
||||||
|
worst = prio
|
||||||
return worst if worst is not None else default_bucket
|
return worst if worst is not None else default_bucket
|
||||||
|
|
||||||
|
|
||||||
@@ -446,26 +452,33 @@ _DUP_FOLDER_TOKENS = (
|
|||||||
|
|
||||||
|
|
||||||
def _path_penalty(path: str) -> int:
|
def _path_penalty(path: str) -> int:
|
||||||
"""Higher = worse keeper candidate. Penalises paths that look like copies/backups."""
|
"""Higher = worse keeper candidate. Penalises FOLDERS (not filenames) that
|
||||||
|
look like copies/backups, plus repeated segments and very deep paths."""
|
||||||
if not path:
|
if not path:
|
||||||
return 0
|
return 0
|
||||||
segments = [s for s in path.split("/") if s]
|
segments = [s for s in path.split("/") if s]
|
||||||
|
if not segments:
|
||||||
|
return 0
|
||||||
|
# Folder segments only — exclude filename basename
|
||||||
|
dir_segments = segments[:-1]
|
||||||
score = 0
|
score = 0
|
||||||
for seg in segments:
|
for seg in dir_segments:
|
||||||
low = seg.lower()
|
low = seg.lower()
|
||||||
for tok in _DUP_FOLDER_TOKENS:
|
for tok in _DUP_FOLDER_TOKENS:
|
||||||
if tok in low.split() or tok == low or f"_{tok}" in low or f"{tok}_" in low or low.startswith(tok) or low.endswith(tok):
|
if (tok in low.split() or tok == low
|
||||||
|
or f"_{tok}" in low or f"{tok}_" in low
|
||||||
|
or low.startswith(tok) or low.endswith(tok)):
|
||||||
score += 100
|
score += 100
|
||||||
break
|
break
|
||||||
# Repeated segments like "Desktop/Desktop/Files" suggest a nested backup
|
# Repeated folder segments like "Desktop/Desktop/Files" suggest a nested backup
|
||||||
seen: set[str] = set()
|
seen: set[str] = set()
|
||||||
for seg in segments:
|
for seg in dir_segments:
|
||||||
low = seg.lower()
|
low = seg.lower()
|
||||||
if low in seen:
|
if low in seen:
|
||||||
score += 30
|
score += 30
|
||||||
seen.add(low)
|
seen.add(low)
|
||||||
# Slight penalty for very deep paths (originals tend to live shallower)
|
# Slight penalty for very deep paths (originals tend to live shallower)
|
||||||
score += max(0, len(segments) - 6) * 5
|
score += max(0, len(dir_segments) - 6) * 5
|
||||||
return score
|
return score
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
2
debian/build-deb.sh
vendored
2
debian/build-deb.sh
vendored
@@ -13,7 +13,7 @@ BUILD_DIR="$REPO_ROOT/build/deb"
|
|||||||
|
|
||||||
# ── Config ────────────────────────────────────────────────────────────────────
|
# ── Config ────────────────────────────────────────────────────────────────────
|
||||||
PKG_NAME="dupfinder"
|
PKG_NAME="dupfinder"
|
||||||
PKG_VERSION="1.0.10"
|
PKG_VERSION="1.0.11"
|
||||||
PKG_ARCH="amd64"
|
PKG_ARCH="amd64"
|
||||||
DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"
|
DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user