Smarter keeper selection: folder-name + mtime signals
Adds a path-penalty score that downranks files in folders named Trashed, Dups, Backup, Copy, Old, Archive, plus a penalty for repeated path segments (e.g. Desktop/Desktop/Files) and very deep paths. Also captures and uses file mtime as a tiebreaker — older files are usually the originals. Applied to all four detection passes (sha256, phash, exif, filesize+dim) and to auto-resolve-exact. New file_mtime column with idempotent migration. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -490,7 +490,8 @@ def auto_resolve_exact():
|
|||||||
|
|
||||||
for gid in groups:
|
for gid in groups:
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
SELECT f.id, f.width, f.height, f.file_size, f.exif_datetime
|
SELECT f.id, f.path, f.width, f.height, f.file_size,
|
||||||
|
f.exif_datetime, f.file_mtime
|
||||||
FROM duplicate_members dm
|
FROM duplicate_members dm
|
||||||
JOIN files f ON f.id = dm.file_id
|
JOIN files f ON f.id = dm.file_id
|
||||||
WHERE dm.group_id = ?
|
WHERE dm.group_id = ?
|
||||||
|
|||||||
@@ -101,6 +101,7 @@ def init_db():
|
|||||||
exif_device TEXT,
|
exif_device TEXT,
|
||||||
width INTEGER,
|
width INTEGER,
|
||||||
height INTEGER,
|
height INTEGER,
|
||||||
|
file_mtime TEXT,
|
||||||
is_takeout INTEGER DEFAULT 0,
|
is_takeout INTEGER DEFAULT 0,
|
||||||
is_edited INTEGER DEFAULT 0,
|
is_edited INTEGER DEFAULT 0,
|
||||||
takeout_json TEXT,
|
takeout_json TEXT,
|
||||||
@@ -169,6 +170,12 @@ def init_db():
|
|||||||
cur.execute(f"ALTER TABLE scans ADD COLUMN {col} {defn}")
|
cur.execute(f"ALTER TABLE scans ADD COLUMN {col} {defn}")
|
||||||
except Exception:
|
except Exception:
|
||||||
pass # column already exists
|
pass # column already exists
|
||||||
|
|
||||||
|
# Migration: file_mtime added in v1.0.3 for keeper-selection scoring
|
||||||
|
try:
|
||||||
|
cur.execute("ALTER TABLE files ADD COLUMN file_mtime TEXT")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
con.commit()
|
con.commit()
|
||||||
|
|
||||||
# ── Detect interrupted scans from previous run ────────────────────────────
|
# ── Detect interrupted scans from previous run ────────────────────────────
|
||||||
@@ -298,6 +305,7 @@ def extract_file(path: str) -> dict:
|
|||||||
"exif_device": None,
|
"exif_device": None,
|
||||||
"width": None,
|
"width": None,
|
||||||
"height": None,
|
"height": None,
|
||||||
|
"file_mtime": _mtime_str(path),
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -356,17 +364,63 @@ class UnionFind:
|
|||||||
|
|
||||||
# ── Detection passes ──────────────────────────────────────────────────────────
|
# ── Detection passes ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Folder-name signals: any path segment matching one of these tokens
|
||||||
|
# is treated as evidence the file is a duplicate copy, not the canonical original.
|
||||||
|
# Tokens are matched case-insensitively against each "/"-separated path segment
# (the file name included), but not as arbitrary substrings: a token matches when
# the segment starts or ends with it, contains it as a whitespace-separated word,
# or has it directly next to an underscore, so
|
||||||
|
# "Trashed", "trash_old", "MyDups" all match.
|
||||||
|
_DUP_FOLDER_TOKENS = (
|
||||||
|
"trash", "trashed", "dup", "dups", "duplicate", "duplicates",
|
||||||
|
"backup", "backups", "copy", "copies", "old", "archive", "archived",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _path_penalty(path: str) -> int:
    """Score how strongly *path* resembles a copy/backup location.

    A larger result means a worse keeper candidate. The path is split on
    "/" and every non-empty component (the file name included) is compared
    case-insensitively:

    * +100 for each component that matches a ``_DUP_FOLDER_TOKENS`` entry
      (counted at most once per component),
    * +30 for each component that repeats an earlier one,
    * +5 for every component beyond the sixth.

    An empty or missing path scores 0.
    """
    if not path:
        return 0

    parts = [piece.lower() for piece in path.split("/") if piece]

    def looks_like_copy(name: str) -> bool:
        # A token counts when it is a whitespace-separated word, a prefix or
        # suffix of the component, or sits directly next to an underscore.
        words = name.split()
        return any(
            token in words
            or name.startswith(token)
            or name.endswith(token)
            or f"_{token}" in name
            or f"{token}_" in name
            for token in _DUP_FOLDER_TOKENS
        )

    flagged = sum(1 for name in parts if looks_like_copy(name))
    # Every component beyond the first occurrence of its (lower-cased) name
    # is a repeat, e.g. "Desktop/Desktop/Files" has exactly one.
    repeats = len(parts) - len(set(parts))
    # Originals tend to live shallower, so only depth past six is charged.
    extra_depth = max(0, len(parts) - 6)

    return 100 * flagged + 30 * repeats + 5 * extra_depth
|
||||||
|
|
||||||
|
|
||||||
def _suggested_keeper_by_resolution(members: list[dict]) -> int:
    """Pick the id of the member to keep from a duplicate group.

    Candidates are first narrowed to those sharing the largest
    (pixel count, file size) pair; a missing width, height or size counts
    as 0. The remaining candidates are then ordered by, in turn:

    1. lowest ``_path_penalty`` of their path (avoids Trashed/, Dups/,
       Backup/, deep nesting),
    2. smallest ``file_mtime`` string (missing values sort last),
    3. smallest ``exif_datetime`` string (missing values sort last).

    The first member in input order wins any remaining tie.

    Raises:
        ValueError: if ``members`` is empty.
    """
    def quality(member):
        pixels = (member["width"] or 0) * (member["height"] or 0)
        return (pixels, member["file_size"] or 0)

    def tiebreak(member):
        penalty = _path_penalty(member.get("path") or "")
        # The "9999…" sentinels push members without a timestamp to the end.
        modified = member.get("file_mtime") or "9999"
        taken = member.get("exif_datetime") or "9999-99-99T99:99:99"
        return (penalty, modified, taken)

    best = max(map(quality, members))
    finalists = [m for m in members if quality(m) == best]
    return min(finalists, key=tiebreak)["id"]
|
||||||
|
|
||||||
|
|
||||||
def _suggested_keeper_oldest(members: list[dict]) -> int:
|
def _suggested_keeper_oldest(members: list[dict]) -> int:
|
||||||
@@ -388,7 +442,7 @@ def _run_sha256_pass(con: sqlite3.Connection, scan_id: int):
|
|||||||
for row in rows:
|
for row in rows:
|
||||||
sha = row["sha256"]
|
sha = row["sha256"]
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
SELECT id, width, height, file_size, exif_datetime
|
SELECT id, path, width, height, file_size, exif_datetime, file_mtime
|
||||||
FROM files WHERE sha256 = ?
|
FROM files WHERE sha256 = ?
|
||||||
""", (sha,))
|
""", (sha,))
|
||||||
members = [dict(r) for r in cur.fetchall()]
|
members = [dict(r) for r in cur.fetchall()]
|
||||||
@@ -411,7 +465,8 @@ def _run_phash_pass(con: sqlite3.Connection, scan_id: int):
|
|||||||
cur = con.cursor()
|
cur = con.cursor()
|
||||||
# Exclude files already in sha256 groups
|
# Exclude files already in sha256 groups
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
SELECT f.id, f.phash, f.width, f.height, f.file_size, f.exif_datetime
|
SELECT f.id, f.path, f.phash, f.width, f.height, f.file_size,
|
||||||
|
f.exif_datetime, f.file_mtime
|
||||||
FROM files f
|
FROM files f
|
||||||
WHERE f.phash IS NOT NULL
|
WHERE f.phash IS NOT NULL
|
||||||
AND length(f.phash) = 16
|
AND length(f.phash) = 16
|
||||||
@@ -508,7 +563,7 @@ def _run_exif_pass(con: sqlite3.Connection, scan_id: int):
|
|||||||
for row in rows:
|
for row in rows:
|
||||||
dt, dev = row["exif_datetime"], row["exif_device"]
|
dt, dev = row["exif_datetime"], row["exif_device"]
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
SELECT id, width, height, file_size, exif_datetime
|
SELECT id, path, width, height, file_size, exif_datetime, file_mtime
|
||||||
FROM files
|
FROM files
|
||||||
WHERE exif_datetime = ? AND exif_device = ?
|
WHERE exif_datetime = ? AND exif_device = ?
|
||||||
""", (dt, dev))
|
""", (dt, dev))
|
||||||
@@ -547,12 +602,13 @@ def _run_filesize_pass(con: sqlite3.Connection, scan_id: int):
|
|||||||
for row in rows:
|
for row in rows:
|
||||||
fs, w, h = row["file_size"], row["width"], row["height"]
|
fs, w, h = row["file_size"], row["width"], row["height"]
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
SELECT id, width, height, file_size, exif_datetime
|
SELECT id, path, width, height, file_size, exif_datetime, file_mtime
|
||||||
FROM files
|
FROM files
|
||||||
WHERE file_size = ? AND width = ? AND height = ?
|
WHERE file_size = ? AND width = ? AND height = ?
|
||||||
""", (fs, w, h))
|
""", (fs, w, h))
|
||||||
members = [dict(r) for r in cur.fetchall()]
|
members = [dict(r) for r in cur.fetchall()]
|
||||||
keeper_id = _suggested_keeper_oldest(members)
|
# Filesize+dim is the weakest signal — folder/mtime tiebreak helps a lot here
|
||||||
|
keeper_id = _suggested_keeper_by_resolution(members)
|
||||||
method_value = f"{fs}::{w}x{h}"
|
method_value = f"{fs}::{w}x{h}"
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"INSERT INTO duplicate_groups (method, method_value) VALUES ('filesize', ?)",
|
"INSERT INTO duplicate_groups (method, method_value) VALUES ('filesize', ?)",
|
||||||
@@ -677,7 +733,8 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
|
|||||||
file_size=:file_size, mime_type=:mime_type,
|
file_size=:file_size, mime_type=:mime_type,
|
||||||
sha256=:sha256, exif_datetime=:exif_datetime,
|
sha256=:sha256, exif_datetime=:exif_datetime,
|
||||||
exif_device=:exif_device, width=:width,
|
exif_device=:exif_device, width=:width,
|
||||||
height=:height, scan_id=:scan_id,
|
height=:height, file_mtime=:file_mtime,
|
||||||
|
scan_id=:scan_id,
|
||||||
status='pending', updated_at=CURRENT_TIMESTAMP
|
status='pending', updated_at=CURRENT_TIMESTAMP
|
||||||
WHERE path=:path
|
WHERE path=:path
|
||||||
""", record)
|
""", record)
|
||||||
@@ -686,12 +743,12 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
|
|||||||
INSERT OR IGNORE INTO files
|
INSERT OR IGNORE INTO files
|
||||||
(path, filename, extension, file_size, mime_type,
|
(path, filename, extension, file_size, mime_type,
|
||||||
sha256, exif_datetime, exif_device, width,
|
sha256, exif_datetime, exif_device, width,
|
||||||
height, scan_id, status)
|
height, file_mtime, scan_id, status)
|
||||||
VALUES
|
VALUES
|
||||||
(:path, :filename, :extension, :file_size,
|
(:path, :filename, :extension, :file_size,
|
||||||
:mime_type, :sha256, :exif_datetime,
|
:mime_type, :sha256, :exif_datetime,
|
||||||
:exif_device, :width, :height, :scan_id,
|
:exif_device, :width, :height, :file_mtime,
|
||||||
'pending')
|
:scan_id, 'pending')
|
||||||
""", record)
|
""", record)
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=N_WORKERS) as pool:
|
with ThreadPoolExecutor(max_workers=N_WORKERS) as pool:
|
||||||
|
|||||||
2
debian/build-deb.sh
vendored
2
debian/build-deb.sh
vendored
@@ -13,7 +13,7 @@ BUILD_DIR="$REPO_ROOT/build/deb"
|
|||||||
|
|
||||||
# ── Config ────────────────────────────────────────────────────────────────────
|
# ── Config ────────────────────────────────────────────────────────────────────
|
||||||
PKG_NAME="dupfinder"
|
PKG_NAME="dupfinder"
|
||||||
PKG_VERSION="1.0.2"
|
PKG_VERSION="1.0.3"
|
||||||
PKG_ARCH="amd64"
|
PKG_ARCH="amd64"
|
||||||
DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"
|
DEB_FILE="${PKG_NAME}_${PKG_VERSION}_${PKG_ARCH}.deb"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user