Initial implementation of duplicate finder
Full project per spec: FastAPI backend, 4-method duplicate detection (SHA-256, phash, EXIF, filesize), Google Takeout pre-processor, 4 scan modes, and dark-theme vanilla JS gallery frontend. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
758
app/scanner.py
Normal file
758
app/scanner.py
Normal file
@@ -0,0 +1,758 @@
|
||||
"""
|
||||
File scanner: discovery, per-file extraction, and all 4 duplicate detection passes.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import mimetypes
|
||||
import os
|
||||
import sqlite3
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import imagehash
|
||||
from PIL import Image, ExifTags, UnidentifiedImageError
|
||||
|
||||
try:
|
||||
from pillow_heif import register_heif_opener
|
||||
register_heif_opener()
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
from takeout import is_takeout_folder, process_takeout
|
||||
|
||||
|
||||
# File extensions (lower-case, leading dot) that extract_file() treats as photos.
PHOTO_EXT = {
    ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif",
    ".webp", ".heic", ".heif", ".raw", ".cr2", ".nef", ".arw",
    ".dng", ".orf", ".rw2", ".pef", ".srw", ".x3f",
}

# File extensions (lower-case, leading dot) that extract_file() treats as videos.
VIDEO_EXT = {
    ".mp4", ".mov", ".avi", ".mkv", ".m4v", ".3gp",
    ".wmv", ".mts", ".m2ts",
}

# Union of both sets; run_scan() only collects files with one of these suffixes.
SUPPORTED_EXT = PHOTO_EXT | VIDEO_EXT

# Path of the SQLite database file opened by get_db().
DB_PATH = "/data/dupfinder.db"

# Shared scan state (updated by background thread, read by status endpoint)
scan_state = {
    "scan_id": None,
    "status": "idle",       # idle | running | complete | error | cancelled
    "phase": "idle",        # discovery | takeout | indexing | phash | grouping | done
    "progress": 0,          # items finished in the current phase
    "total": 0,             # items expected in the current phase
    "message": "",          # human-readable description of the current step
    "cancel_requested": False,  # polled by run_scan() between units of work
    "stats": {},            # filled in by _update_stats()
}
|
||||
|
||||
|
||||
# ── DB helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_db() -> sqlite3.Connection:
    """Open a new connection to the database at ``DB_PATH``.

    The connection waits up to 30 seconds on a locked database, returns
    rows as ``sqlite3.Row`` (so columns can be read by name), and has WAL
    journaling and foreign-key enforcement switched on.  The caller owns
    the connection and is responsible for closing it.
    """
    connection = sqlite3.connect(DB_PATH, timeout=30)
    connection.row_factory = sqlite3.Row
    # Both pragmas are per-connection settings, so they are issued on every open.
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA foreign_keys=ON"):
        connection.execute(pragma)
    return connection
|
||||
|
||||
|
||||
def init_db():
    """Create the schema (tables and indexes) if it does not exist yet.

    Safe to call repeatedly: every statement uses ``IF NOT EXISTS``.  The
    connection is always closed, even when schema creation fails.
    """
    con = get_db()
    try:
        con.executescript("""
            CREATE TABLE IF NOT EXISTS files (
                id              INTEGER PRIMARY KEY AUTOINCREMENT,
                path            TEXT UNIQUE NOT NULL,
                filename        TEXT NOT NULL,
                extension       TEXT,
                file_size       INTEGER,
                mime_type       TEXT,
                sha256          TEXT,
                phash           TEXT,
                exif_datetime   TEXT,
                exif_device     TEXT,
                width           INTEGER,
                height          INTEGER,
                is_takeout      INTEGER DEFAULT 0,
                is_edited       INTEGER DEFAULT 0,
                takeout_json    TEXT,
                scan_id         INTEGER,
                status          TEXT DEFAULT 'pending',
                created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );

            CREATE TABLE IF NOT EXISTS scans (
                id              INTEGER PRIMARY KEY AUTOINCREMENT,
                folder_path     TEXT NOT NULL,
                started_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                completed_at    TIMESTAMP,
                total_files     INTEGER DEFAULT 0,
                status          TEXT DEFAULT 'running'
            );

            CREATE TABLE IF NOT EXISTS duplicate_groups (
                id              INTEGER PRIMARY KEY AUTOINCREMENT,
                method          TEXT NOT NULL,
                method_value    TEXT,
                reviewed        INTEGER DEFAULT 0,
                created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );

            CREATE TABLE IF NOT EXISTS duplicate_members (
                id              INTEGER PRIMARY KEY AUTOINCREMENT,
                group_id        INTEGER REFERENCES duplicate_groups(id) ON DELETE CASCADE,
                file_id         INTEGER REFERENCES files(id) ON DELETE CASCADE,
                is_keeper       INTEGER DEFAULT 0,
                suggested       INTEGER DEFAULT 0
            );

            CREATE INDEX IF NOT EXISTS idx_sha256 ON files(sha256);
            CREATE INDEX IF NOT EXISTS idx_phash ON files(phash);
            CREATE INDEX IF NOT EXISTS idx_exif_dt ON files(exif_datetime, exif_device);
            CREATE INDEX IF NOT EXISTS idx_size_dim ON files(file_size, width, height);
            CREATE INDEX IF NOT EXISTS idx_status ON files(status);
        """)
        con.commit()
    finally:
        # Release the handle even if a statement above raised.
        con.close()
|
||||
|
||||
|
||||
# ── Per-file extraction ───────────────────────────────────────────────────────
|
||||
|
||||
def _sha256(path: str) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 64 KiB blocks so large files are never loaded into
    memory at once.  Errors from opening or reading the file propagate to
    the caller.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(65536), b""):
            digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def _exif_data(path: str) -> tuple[str | None, str | None]:
    """Return ``(exif_datetime, exif_device)`` for the image at *path*.

    ``exif_datetime`` is ``DateTimeOriginal`` (falling back to ``DateTime``)
    reformatted as ``YYYY-MM-DDTHH:MM:SS``, or ``None`` when the tag is
    missing or cannot be parsed.  ``exif_device`` is ``"<Make> <Model>"``
    with surrounding whitespace removed, or ``None`` when both tags are
    empty.  Any failure to open the image or read its EXIF block yields
    ``(None, None)``; this function never raises.
    """
    try:
        # The context manager closes the underlying file; without it every
        # scanned photo would leave a handle open until garbage collection.
        with Image.open(path) as img:
            exif_raw = img._getexif()
        if not exif_raw:
            return None, None

        # Translate numeric tag ids to names; unknown ids keep their number.
        exif = {ExifTags.TAGS.get(k, k): v for k, v in exif_raw.items()}

        dt = None
        raw_dt = exif.get("DateTimeOriginal") or exif.get("DateTime")
        if raw_dt:
            try:
                dt = datetime.strptime(raw_dt, "%Y:%m:%d %H:%M:%S").strftime("%Y-%m-%dT%H:%M:%S")
            except (TypeError, ValueError):
                # Malformed or non-string date: drop the date only, so a
                # valid Make/Model below is still returned.
                dt = None

        make = str(exif.get("Make", "")).strip()
        model = str(exif.get("Model", "")).strip()
        device = (make + " " + model).strip() if (make or model) else None
        return dt, device
    except Exception:
        # Deliberately broad: metadata extraction is best-effort and an
        # unreadable image must not abort the scan.
        return None, None
|
||||
|
||||
|
||||
def _image_dims(path: str) -> tuple[int | None, int | None]:
    """Return the ``(width, height)`` reported by Pillow for the image at *path*.

    Returns ``(None, None)`` when the file cannot be opened as an image.
    """
    dims = (None, None)
    try:
        with Image.open(path) as picture:
            dims = picture.size  # (width, height)
    except Exception:
        # Any failure (including one raised while closing) means "unknown".
        dims = (None, None)
    return dims
|
||||
|
||||
|
||||
def _phash(path: str) -> str | None:
    """Return the perceptual hash of the image at *path* as a string.

    The value is ``str(imagehash.phash(img))``.  Returns ``None`` when the
    file cannot be opened or hashed.
    """
    fingerprint = None
    try:
        with Image.open(path) as picture:
            fingerprint = str(imagehash.phash(picture))
    except Exception:
        # Unreadable or unsupported image: report "no hash" instead of failing.
        fingerprint = None
    return fingerprint
|
||||
|
||||
|
||||
def _video_dims(path: str) -> tuple[int | None, int | None]:
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[
|
||||
"ffprobe", "-v", "error",
|
||||
"-select_streams", "v:0",
|
||||
"-show_entries", "stream=width,height",
|
||||
"-of", "csv=p=0",
|
||||
path,
|
||||
],
|
||||
capture_output=True, text=True, timeout=10,
|
||||
)
|
||||
parts = result.stdout.strip().split(",")
|
||||
if len(parts) == 2:
|
||||
return int(parts[0]), int(parts[1])
|
||||
except Exception:
|
||||
pass
|
||||
return None, None
|
||||
|
||||
|
||||
def _mtime_str(path: str) -> str | None:
|
||||
try:
|
||||
ts = os.path.getmtime(path)
|
||||
return datetime.fromtimestamp(ts).strftime("%Y-%m-%dT%H:%M:%S")
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def extract_file(path: str) -> dict:
    """Collect the per-file metadata stored in the ``files`` table.

    Returns a dict with the keys ``path``, ``filename``, ``extension``,
    ``file_size``, ``mime_type``, ``sha256``, ``phash``, ``exif_datetime``,
    ``exif_device``, ``width`` and ``height``.  Values that cannot be
    determined are ``None``.  ``phash`` is always ``None`` here; it is
    filled in by a later scan phase.  For photos without a usable EXIF
    date, and for all videos, ``exif_datetime`` holds the file's
    modification time instead.
    """
    location = Path(path)
    suffix = location.suffix.lower()

    try:
        size = os.path.getsize(path)
    except OSError:
        size = None

    try:
        digest = _sha256(path)
    except OSError:
        digest = None

    width = height = taken_at = device = None
    if suffix in PHOTO_EXT:
        width, height = _image_dims(path)
        exif_dt, device = _exif_data(path)
        taken_at = exif_dt or _mtime_str(path)
        # phash computed in separate phase for progress reporting
    elif suffix in VIDEO_EXT:
        width, height = _video_dims(path)
        taken_at = _mtime_str(path)

    return {
        "path": path,
        "filename": location.name,
        "extension": suffix,
        "file_size": size,
        "mime_type": mimetypes.guess_type(path)[0],
        "sha256": digest,
        "phash": None,
        "exif_datetime": taken_at,
        "exif_device": device,
        "width": width,
        "height": height,
    }
|
||||
|
||||
|
||||
# ── Union-Find for phash grouping ────────────────────────────────────────────
|
||||
|
||||
class UnionFind:
    """Disjoint-set forest over integer ids.

    Ids are registered lazily the first time they are passed to ``find``
    or ``union``.
    """

    def __init__(self):
        # Maps every known id to its parent; a root is its own parent.
        self.parent: dict[int, int] = {}

    def find(self, x: int) -> int:
        """Return the representative (root) of the set containing *x*.

        Implemented iteratively: ``union`` does not balance by rank, so
        parent chains can grow as long as the number of ids, which would
        overflow the interpreter's recursion limit in a recursive version.
        """
        parent = self.parent
        if x not in parent:
            parent[x] = x
            return x

        # First pass: walk up to the root.
        root = x
        while parent[root] != root:
            root = parent[root]

        # Second pass: path compression, pointing every visited node at the root.
        while parent[x] != root:
            following = parent[x]
            parent[x] = root
            x = following
        return root

    def union(self, x: int, y: int):
        """Merge the sets containing *x* and *y* (no-op if already merged)."""
        px, py = self.find(x), self.find(y)
        if px != py:
            self.parent[px] = py

    def groups(self) -> dict[int, list[int]]:
        """Return ``{root: [member ids]}`` for every set with at least two members."""
        result: dict[int, list[int]] = {}
        # find() only rewrites values of existing keys, so iterating the
        # dict while calling it is safe.
        for x in self.parent:
            result.setdefault(self.find(x), []).append(x)
        return {k: v for k, v in result.items() if len(v) >= 2}
|
||||
|
||||
|
||||
# ── Detection passes ──────────────────────────────────────────────────────────
|
||||
|
||||
def _suggested_keeper_by_resolution(members: list[dict]) -> int:
    """Return the ``id`` of the member that should be suggested as keeper.

    Preference order: most pixels (``width * height``), then largest
    ``file_size``, then oldest ``exif_datetime``.  Missing dimensions and
    sizes count as 0; a missing date sorts after every real date.  When
    members tie on all three criteria the first one in *members* wins.

    Raises ``ValueError`` if *members* is empty.
    """
    def rank(member: dict) -> tuple:
        pixels = (member["width"] or 0) * (member["height"] or 0)
        size = member["file_size"] or 0
        # ISO-8601 strings order chronologically; "9999" pushes undated
        # files behind dated ones.
        taken = member.get("exif_datetime") or "9999"
        # Negate the "bigger is better" fields so one min() covers all
        # three criteria, including "older is better" for the date.
        return (-pixels, -size, taken)

    return min(members, key=rank)["id"]
|
||||
|
||||
|
||||
def _suggested_keeper_oldest(members: list[dict]) -> int:
    """Return the ``id`` of the member with the earliest ``exif_datetime``.

    Members without a date are ranked as ``"9999"``, i.e. after any real
    date.  On a tie the first member in *members* wins.
    """
    oldest = min(members, key=lambda member: member["exif_datetime"] or "9999")
    return oldest["id"]
|
||||
|
||||
|
||||
def _run_sha256_pass(con: sqlite3.Connection, scan_id: int):
    """Pass 1: group files that share an identical SHA-256 digest.

    For every digest that occurs more than once, one ``duplicate_groups``
    row (method ``'sha256'``) and one ``duplicate_members`` row per file are
    inserted; the member picked by ``_suggested_keeper_by_resolution`` gets
    ``suggested = 1``.  Nothing is committed here, and *scan_id* is not
    used by this pass.
    """
    cur = con.cursor()
    repeated = cur.execute("""
        SELECT sha256, COUNT(*) as cnt
        FROM files
        WHERE sha256 IS NOT NULL
        GROUP BY sha256
        HAVING cnt > 1
    """).fetchall()

    for entry in repeated:
        digest = entry["sha256"]
        members = [
            dict(found)
            for found in cur.execute("""
                SELECT id, width, height, file_size, exif_datetime
                FROM files WHERE sha256 = ?
            """, (digest,))
        ]

        keeper_id = _suggested_keeper_by_resolution(members)

        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('sha256', ?)",
            (digest,),
        )
        group_id = cur.lastrowid
        cur.executemany(
            "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
            [(group_id, m["id"], 1 if m["id"] == keeper_id else 0) for m in members],
        )
|
||||
|
||||
|
||||
def _run_phash_pass(con: sqlite3.Connection, scan_id: int, threshold: int = 10):
    """Pass 2: group visually similar images by perceptual-hash distance.

    Candidates are non-video files with a stored ``phash`` that are not
    already members of a ``'sha256'`` group.  Two images are linked when the
    difference between their hashes is at most *threshold*; links are
    merged transitively with ``UnionFind``.  Each resulting set becomes one
    ``duplicate_groups`` row (method ``'phash'``, value = keeper's hash).

    To limit the quadratic comparison, images are only compared with others
    whose hash starts with the same two hex digits; similar images whose
    hashes differ in that prefix are therefore not detected.

    Nothing is committed here, and *scan_id* is not used by this pass.
    """
    from itertools import combinations

    cur = con.cursor()
    # Exclude files already in sha256 groups
    cur.execute("""
        SELECT f.id, f.phash, f.width, f.height, f.file_size, f.exif_datetime
        FROM files f
        WHERE f.phash IS NOT NULL
          AND f.extension NOT IN (
              '.mp4','.mov','.avi','.mkv','.m4v','.3gp','.wmv','.mts','.m2ts'
          )
          AND f.id NOT IN (
              SELECT dm.file_id FROM duplicate_members dm
              JOIN duplicate_groups dg ON dg.id = dm.group_id
              WHERE dg.method = 'sha256'
          )
    """)
    rows = [dict(r) for r in cur.fetchall()]

    if len(rows) < 2:
        return

    # Parse every stored hash exactly once (instead of once per compared
    # pair) and bucket by the first 2 hex chars to reduce O(n²) comparisons.
    buckets: dict[str, list[tuple]] = {}
    for r in rows:
        try:
            parsed = imagehash.hex_to_hash(r["phash"])
        except Exception:
            # A stored value that cannot be parsed can never match anything;
            # skipping it keeps the pass best-effort.
            continue
        buckets.setdefault(r["phash"][:2], []).append((r["id"], parsed))

    uf = UnionFind()
    for bucket in buckets.values():
        for (id_a, hash_a), (id_b, hash_b) in combinations(bucket, 2):
            try:
                similar = (hash_a - hash_b) <= threshold
            except (TypeError, ValueError):
                # Hashes of different sizes are not comparable.
                continue
            if similar:
                uf.union(id_a, id_b)

    id_map = {r["id"]: r for r in rows}
    for member_ids in uf.groups().values():
        members = [id_map[mid] for mid in member_ids if mid in id_map]
        if len(members) < 2:
            continue
        keeper_id = _suggested_keeper_by_resolution(members)
        keeper = id_map[keeper_id]
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('phash', ?)",
            (keeper["phash"],),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )
|
||||
|
||||
|
||||
def _run_exif_pass(con: sqlite3.Connection, scan_id: int):
    """Pass 3: group files that share the same timestamp and device.

    Only files with both ``exif_datetime`` and ``exif_device`` set, and not
    already members of a ``'sha256'`` or ``'phash'`` group, take part.  Each
    ``(exif_datetime, exif_device)`` pair seen more than once becomes one
    ``duplicate_groups`` row (method ``'exif'``, value ``"<dt>::<device>"``).

    Nothing is committed here, and *scan_id* is not used by this pass.
    """
    # Constant SQL fragment (no user input).  It is shared by the grouping
    # query and the member query so both look at exactly the same files;
    # otherwise files already handled by an earlier pass would be pulled
    # back into exif groups.
    not_yet_grouped = """
        id NOT IN (
            SELECT dm.file_id FROM duplicate_members dm
            JOIN duplicate_groups dg ON dg.id = dm.group_id
            WHERE dg.method IN ('sha256', 'phash')
        )
    """

    cur = con.cursor()
    cur.execute(f"""
        SELECT exif_datetime, exif_device, COUNT(*) as cnt
        FROM files
        WHERE exif_datetime IS NOT NULL
          AND exif_device IS NOT NULL
          AND {not_yet_grouped}
        GROUP BY exif_datetime, exif_device
        HAVING cnt > 1
    """)
    rows = cur.fetchall()
    for row in rows:
        dt, dev = row["exif_datetime"], row["exif_device"]
        cur.execute(f"""
            SELECT id, width, height, file_size, exif_datetime
            FROM files
            WHERE exif_datetime = ? AND exif_device = ?
              AND {not_yet_grouped}
        """, (dt, dev))
        members = [dict(r) for r in cur.fetchall()]
        keeper_id = _suggested_keeper_by_resolution(members)
        method_value = f"{dt}::{dev}"
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('exif', ?)",
            (method_value,),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )
|
||||
|
||||
|
||||
def _run_filesize_pass(con: sqlite3.Connection, scan_id: int):
    """Pass 4: group files with identical size and pixel dimensions.

    Only files with ``file_size``, ``width`` and ``height`` all set, and not
    already members of a ``'sha256'``, ``'phash'`` or ``'exif'`` group, take
    part.  Each ``(file_size, width, height)`` triple seen more than once
    becomes one ``duplicate_groups`` row (method ``'filesize'``, value
    ``"<size>::<width>x<height>"``); the oldest member is suggested as keeper.

    Nothing is committed here, and *scan_id* is not used by this pass.
    """
    # Constant SQL fragment (no user input).  It is shared by the grouping
    # query and the member query so both look at exactly the same files;
    # otherwise files already handled by an earlier pass would be pulled
    # back into filesize groups.
    not_yet_grouped = """
        id NOT IN (
            SELECT dm.file_id FROM duplicate_members dm
            JOIN duplicate_groups dg ON dg.id = dm.group_id
            WHERE dg.method IN ('sha256', 'phash', 'exif')
        )
    """

    cur = con.cursor()
    cur.execute(f"""
        SELECT file_size, width, height, COUNT(*) as cnt
        FROM files
        WHERE file_size IS NOT NULL
          AND width IS NOT NULL
          AND height IS NOT NULL
          AND {not_yet_grouped}
        GROUP BY file_size, width, height
        HAVING cnt > 1
    """)
    rows = cur.fetchall()
    for row in rows:
        fs, w, h = row["file_size"], row["width"], row["height"]
        cur.execute(f"""
            SELECT id, width, height, file_size, exif_datetime
            FROM files
            WHERE file_size = ? AND width = ? AND height = ?
              AND {not_yet_grouped}
        """, (fs, w, h))
        members = [dict(r) for r in cur.fetchall()]
        keeper_id = _suggested_keeper_oldest(members)
        method_value = f"{fs}::{w}x{h}"
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('filesize', ?)",
            (method_value,),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )
|
||||
|
||||
|
||||
# ── Main scan entry point ─────────────────────────────────────────────────────
|
||||
|
||||
def _cancel_if_requested(con, cur, scan_id: int) -> bool:
    """Finish the scan as cancelled if a cancel was requested.

    Returns True (after marking the scan row, committing, and setting
    ``scan_state["status"]``) when the caller must stop; False otherwise.
    """
    if not scan_state["cancel_requested"]:
        return False
    _mark_scan(cur, scan_id, "cancelled")
    con.commit()
    scan_state["status"] = "cancelled"
    return True


def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
    """Main scan function — runs in background thread.

    Phases, reported through ``scan_state``: discovery (walk *folder_path*),
    takeout pre-processing, indexing (hash + metadata per file), phash
    (perceptual hashes for photos), grouping (the four detection passes).

    *mode* values handled below:
      - ``"incremental"``: skip files whose size is unchanged; rebuild all groups.
      - ``"new_files"``: never re-index files already in the database.
      - ``"full_reset"``: wipe files and groups before indexing.
      - ``"regroup"``: rebuild all groups (files are still re-indexed).

    Any exception is recorded in ``scan_state`` and on the scan row instead
    of being raised.  The database connection is always closed.
    """
    con = get_db()
    cur = con.cursor()

    try:
        # ── Phase: discovery ──────────────────────────────────────────────
        scan_state.update(phase="discovery", progress=0, total=0,
                          message="Discovering files...")

        all_files = []
        for root, dirs, files in os.walk(folder_path):
            # Prune hidden directories in place so os.walk does not descend into them.
            dirs[:] = [d for d in dirs if not d.startswith(".")]
            for fname in files:
                if fname.endswith(".json"):
                    continue
                ext = Path(fname).suffix.lower()
                if ext in SUPPORTED_EXT:
                    all_files.append(os.path.join(root, fname))

        scan_state["total"] = len(all_files)
        scan_state["message"] = f"Found {len(all_files):,} files"

        if _cancel_if_requested(con, cur, scan_id):
            return

        # ── Mode: full reset ──────────────────────────────────────────────
        if mode == "full_reset":
            cur.execute("DELETE FROM duplicate_members")
            cur.execute("DELETE FROM duplicate_groups")
            cur.execute("DELETE FROM files")
            con.commit()

        # ── Phase: takeout pre-processing ─────────────────────────────────
        scan_state.update(phase="takeout", message="Checking for Google Takeout structure...")
        if is_takeout_folder(folder_path):
            scan_state["message"] = "Processing Google Takeout sidecars..."
            process_takeout(folder_path, DB_PATH)

        if _cancel_if_requested(con, cur, scan_id):
            return

        # ── Phase: indexing ───────────────────────────────────────────────
        scan_state.update(phase="indexing", progress=0,
                          message="Indexing files (SHA-256 + EXIF + dimensions)...")

        for i, path in enumerate(all_files):
            if _cancel_if_requested(con, cur, scan_id):
                return

            scan_state["progress"] = i + 1
            scan_state["message"] = f"Indexing: {Path(path).name}"

            # Check existing record
            cur.execute("SELECT id, file_size, updated_at FROM files WHERE path = ?", (path,))
            existing = cur.fetchone()

            try:
                current_size = os.path.getsize(path)
            except OSError:
                # File vanished or is unreadable since discovery: skip it.
                continue

            if existing and mode in ("incremental", "new_files"):
                if mode == "new_files":
                    # Skip entirely — don't re-hash existing files
                    cur.execute("UPDATE files SET scan_id = ? WHERE path = ?", (scan_id, path))
                    continue
                # Incremental: skip if size unchanged (use size as proxy for change)
                if existing["file_size"] == current_size:
                    cur.execute("UPDATE files SET scan_id = ? WHERE path = ?", (scan_id, path))
                    continue
                # File changed — re-hash, clear group memberships
                cur.execute(
                    "DELETE FROM duplicate_members WHERE file_id = ?", (existing["id"],)
                )

            try:
                record = extract_file(path)
            except Exception:
                # Record the failure on the file row and keep scanning.
                cur.execute(
                    "INSERT OR IGNORE INTO files (path, filename, extension, scan_id, status) "
                    "VALUES (?, ?, ?, ?, 'error')",
                    (path, Path(path).name, Path(path).suffix.lower(), scan_id),
                )
                cur.execute(
                    "UPDATE files SET status='error', scan_id=?, updated_at=CURRENT_TIMESTAMP "
                    "WHERE path=?",
                    (scan_id, path),
                )
                con.commit()
                continue

            record["scan_id"] = scan_id
            if existing:
                # The phash phase below only fills rows where phash IS NULL,
                # so a stale hash must be cleared when the content changed.
                # The CASE compares the stored sha256 (pre-update value) with
                # the new one and keeps the phash only when they are equal.
                cur.execute("""
                    UPDATE files SET
                        filename=:filename, extension=:extension, file_size=:file_size,
                        mime_type=:mime_type, sha256=:sha256,
                        phash=CASE WHEN sha256 IS :sha256 THEN phash ELSE NULL END,
                        exif_datetime=:exif_datetime, exif_device=:exif_device,
                        width=:width, height=:height, scan_id=:scan_id,
                        status='pending', updated_at=CURRENT_TIMESTAMP
                    WHERE path=:path
                """, record)
            else:
                cur.execute("""
                    INSERT OR IGNORE INTO files
                        (path, filename, extension, file_size, mime_type, sha256,
                         exif_datetime, exif_device, width, height, scan_id, status)
                    VALUES
                        (:path, :filename, :extension, :file_size, :mime_type, :sha256,
                         :exif_datetime, :exif_device, :width, :height, :scan_id, 'pending')
                """, record)

            # Commit in batches so progress survives a crash without
            # paying for a transaction per file.
            if (i + 1) % 100 == 0:
                con.commit()

        con.commit()

        # ── Phase: phash ──────────────────────────────────────────────────
        scan_state.update(phase="phash", progress=0,
                          message="Computing perceptual hashes...")

        cur.execute("""
            SELECT id, path FROM files
            WHERE extension IN (
                '.jpg','.jpeg','.png','.gif','.bmp','.tiff','.tif',
                '.webp','.heic','.heif','.raw','.cr2','.nef','.arw',
                '.dng','.orf','.rw2','.pef','.srw','.x3f'
            ) AND phash IS NULL AND status != 'error'
        """)
        photo_rows = cur.fetchall()
        scan_state["total"] = len(photo_rows)

        for i, row in enumerate(photo_rows):
            if _cancel_if_requested(con, cur, scan_id):
                return

            scan_state["progress"] = i + 1
            scan_state["message"] = f"Phash: {Path(row['path']).name}"
            ph = _phash(row["path"])
            if ph:
                cur.execute("UPDATE files SET phash=? WHERE id=?", (ph, row["id"]))
            if (i + 1) % 200 == 0:
                con.commit()

        con.commit()

        # ── Phase: grouping ───────────────────────────────────────────────
        scan_state.update(phase="grouping", progress=0, total=4,
                          message="Running duplicate detection...")

        if mode in ("incremental", "full_reset", "regroup"):
            cur.execute("DELETE FROM duplicate_members")
            cur.execute("DELETE FROM duplicate_groups")
            con.commit()
        elif mode == "new_files":
            # Only clear groups containing new files
            # NOTE(review): the indexing loop above also stamps the current
            # scan_id on files that already existed, so this matches every
            # file seen in this scan, not only new ones; and the passes below
            # re-create groups for all files, so groups that survive this
            # DELETE may end up duplicated.  Confirm the intended behaviour.
            cur.execute("""
                DELETE FROM duplicate_groups WHERE id IN (
                    SELECT DISTINCT dm.group_id FROM duplicate_members dm
                    JOIN files f ON f.id = dm.file_id
                    WHERE f.scan_id = ?
                )
            """, (scan_id,))
            con.commit()

        scan_state["message"] = "Pass 1/4: SHA-256 exact duplicates..."
        _run_sha256_pass(con, scan_id)
        scan_state["progress"] = 1
        con.commit()

        scan_state["message"] = "Pass 2/4: Perceptual hash similarity..."
        _run_phash_pass(con, scan_id)
        scan_state["progress"] = 2
        con.commit()

        scan_state["message"] = "Pass 3/4: EXIF timestamp + device..."
        _run_exif_pass(con, scan_id)
        scan_state["progress"] = 3
        con.commit()

        scan_state["message"] = "Pass 4/4: File size + dimensions..."
        _run_filesize_pass(con, scan_id)
        scan_state["progress"] = 4
        con.commit()

        # ── Restore keeper statuses for mode=incremental ──────────────────
        if mode == "incremental":
            # If a previously marked keeper no longer appears in any group, reset to pending
            # NOTE(review): all members were deleted and re-inserted above with
            # the default is_keeper=0, so this subquery is empty at this point
            # and every 'keeper' file is reset.  Confirm this is intended.
            cur.execute("""
                UPDATE files SET status='pending'
                WHERE status='keeper'
                  AND id NOT IN (
                      SELECT file_id FROM duplicate_members WHERE is_keeper=1
                  )
            """)
            con.commit()

        # Update scan record
        cur.execute(
            "UPDATE scans SET completed_at=CURRENT_TIMESTAMP, total_files=?, status='complete' "
            "WHERE id=?",
            (len(all_files), scan_id),
        )
        con.commit()

        scan_state.update(status="complete", phase="done",
                          message="Scan complete.", progress=scan_state["total"])
        _update_stats()

    except Exception as e:
        # Top-level boundary of the background thread: surface the failure
        # through scan_state and the scan row instead of losing it.
        scan_state.update(status="error", message=str(e))
        try:
            _mark_scan(cur, scan_id, "error")
            con.commit()
        except Exception:
            # The database itself may be the thing that failed.
            pass
    finally:
        con.close()
|
||||
|
||||
|
||||
def _mark_scan(cur, scan_id: int, status: str):
    """Stamp the ``scans`` row *scan_id* with *status* and the current time.

    Executes on the given cursor and does not commit; that is left to the
    caller.
    """
    statement = "UPDATE scans SET completed_at=CURRENT_TIMESTAMP, status=? WHERE id=?"
    arguments = (status, scan_id)
    cur.execute(statement, arguments)
|
||||
|
||||
|
||||
def _update_stats():
    """Refresh ``scan_state["stats"]`` from the database.

    Best-effort: on a database error the previous stats are left in place
    and nothing is raised.  The connection is always closed.

    The stats dict contains ``total_files`` (files not in error state),
    ``duplicate_files`` / ``duplicate_size_bytes`` (files with status
    ``'redundant'``), ``groups_by_method`` (``{method: {"groups", "files"}}``),
    and the number of ``reviewed`` and ``pending`` groups.
    """
    try:
        con = get_db()
    except sqlite3.Error:
        return

    try:
        cur = con.cursor()
        cur.execute("SELECT COUNT(*) FROM files WHERE status != 'error'")
        total_files = cur.fetchone()[0]

        cur.execute("SELECT COUNT(*), SUM(file_size) FROM files WHERE status='redundant'")
        redundant = cur.fetchone()
        dup_count = redundant[0] or 0
        dup_size = redundant[1] or 0  # SUM() is NULL when no rows match

        # One row per detection method: number of groups and total members.
        cur.execute("""
            SELECT method,
                   COUNT(*) as groups,
                   (SELECT COUNT(*) FROM duplicate_members dm2
                    JOIN duplicate_groups dg2 ON dg2.id=dm2.group_id
                    WHERE dg2.method=dg.method) as files
            FROM duplicate_groups dg
            GROUP BY method
        """)
        by_method = {
            entry["method"]: {"groups": entry["groups"], "files": entry["files"]}
            for entry in cur.fetchall()
        }

        cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
        reviewed = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
        pending = cur.fetchone()[0]

        scan_state["stats"] = {
            "total_files": total_files,
            "duplicate_files": dup_count,
            "duplicate_size_bytes": dup_size,
            "groups_by_method": by_method,
            "reviewed": reviewed,
            "pending": pending,
        }
    except sqlite3.Error:
        # Stats are informational only; keep the previous values.
        pass
    finally:
        con.close()
|
||||
Reference in New Issue
Block a user