Initial implementation of duplicate finder

Full project per spec: FastAPI backend, 4-method duplicate detection
(SHA-256, phash, EXIF, filesize), Google Takeout pre-processor,
4 scan modes, and dark-theme vanilla JS gallery frontend.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
tocmo
2026-04-04 23:42:58 -04:00
commit 868da9016d
8 changed files with 2993 additions and 0 deletions

599
app/main.py Normal file
View File

@@ -0,0 +1,599 @@
"""
FastAPI application — all API routes for the duplicate finder.
"""
import csv
import io
import os
import sqlite3
import subprocess
import threading
from pathlib import Path
from typing import Optional
from fastapi import FastAPI, HTTPException, Query, Request
from fastapi.responses import (
FileResponse, JSONResponse, Response, StreamingResponse
)
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel
import scanner as sc
app = FastAPI(title="Duplicate Finder")
templates = Jinja2Templates(directory="/app/templates")
app.mount("/static", StaticFiles(directory="/app/static"), name="static")
# Per-method UI metadata: badge color + human-readable label for each of the
# four detection passes.  Keys must match duplicate_groups.method values.
METHOD_META = {
    "sha256": {"color": "#378ADD", "label": "Exact copy"},
    "phash": {"color": "#9b7de8", "label": "Visual match"},
    "exif": {"color": "#e2a43a", "label": "Same moment"},
    "filesize": {"color": "#888780", "label": "Possible match"},
}
# ── Startup ───────────────────────────────────────────────────────────────────
@app.on_event("startup")
def startup():
    """Create the SQLite schema (idempotent) when the app starts.

    NOTE(review): @app.on_event is deprecated in newer FastAPI in favor
    of lifespan handlers — consider migrating on the next upgrade.
    """
    sc.init_db()
# ── Frontend ──────────────────────────────────────────────────────────────────
@app.get("/")
def index(request: Request):
    """Serve the single-page gallery frontend."""
    return templates.TemplateResponse("index.html", {"request": request})
# ── DB helper ─────────────────────────────────────────────────────────────────
def get_db() -> sqlite3.Connection:
    """Open a new DB connection (delegates to scanner.get_db; caller closes)."""
    return sc.get_db()
# ── Scan management ───────────────────────────────────────────────────────────
class ScanStartBody(BaseModel):
    """Request body for POST /api/scan/start."""
    # Absolute path of the folder to scan.
    folder_path: str
    # One of: incremental | full_reset | new_files | regroup.
    mode: str = "incremental"
@app.post("/api/scan/start")
def scan_start(body: ScanStartBody):
    """Start a background scan of ``body.folder_path``.

    Rejects the request when a scan is already running or the mode is
    unknown.  For mode="full_reset" all previously indexed data is
    wiped first.  Returns {"scan_id": <id of the new scans row>}.
    """
    # NOTE(review): this check-then-act on shared scan_state is not
    # atomic; two near-simultaneous requests could both pass it.
    if sc.scan_state["status"] == "running":
        raise HTTPException(400, "A scan is already running")
    mode = body.mode
    if mode not in ("incremental", "full_reset", "new_files", "regroup"):
        raise HTTPException(400, f"Unknown scan mode: {mode}")
    # Single connection for both the optional wipe and the scans insert
    # (the original opened and closed two connections back to back).
    con = get_db()
    try:
        cur = con.cursor()
        if mode == "full_reset":
            # Delete in FK-dependency order.
            cur.execute("DELETE FROM duplicate_members")
            cur.execute("DELETE FROM duplicate_groups")
            cur.execute("DELETE FROM files")
            cur.execute("DELETE FROM scans")
        cur.execute(
            "INSERT INTO scans (folder_path, status) VALUES (?, 'running')",
            (body.folder_path,),
        )
        scan_id = cur.lastrowid
        con.commit()
    finally:
        con.close()
    sc.scan_state.update(
        scan_id=scan_id,
        status="running",
        phase="discovery",
        progress=0,
        total=0,
        message="Starting...",
        cancel_requested=False,
        stats={},
    )
    # Daemon thread: a dying server should not be held open by a scan.
    thread = threading.Thread(
        target=sc.run_scan,
        args=(body.folder_path, scan_id, mode),
        daemon=True,
    )
    thread.start()
    return {"scan_id": scan_id}
@app.get("/api/scan/status")
def scan_status():
    """Return the live scan state plus current group/file counts."""
    state = sc.scan_state
    con = get_db()
    cur = con.cursor()

    def scalar(sql, *args):
        # Tiny helper: run a COUNT-style query and return its single value.
        cur.execute(sql, args)
        return cur.fetchone()[0]

    stats = {
        f"{m}_groups": scalar(
            "SELECT COUNT(*) FROM duplicate_groups WHERE method=?", m
        )
        for m in ("sha256", "phash", "exif", "filesize")
    }
    stats["total_groups"] = scalar("SELECT COUNT(*) FROM duplicate_groups")
    stats["reviewed"] = scalar("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
    stats["pending"] = scalar("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
    stats["total_files"] = scalar("SELECT COUNT(*) FROM files WHERE status != 'error'")
    con.close()
    return {
        "scan_id": state["scan_id"],
        "status": state["status"],
        "phase": state["phase"],
        "progress": state["progress"],
        "total": state["total"],
        "message": state["message"],
        "stats": stats,
    }
@app.post("/api/scan/cancel")
def scan_cancel():
    """Flag the running scan for cancellation (the worker polls the flag)."""
    running = sc.scan_state["status"] == "running"
    if not running:
        raise HTTPException(400, "No scan is currently running")
    sc.scan_state["cancel_requested"] = True
    return {"success": True}
@app.delete("/api/scan/reset")
def scan_reset(confirm: str = Query("")):
    """Wipe all indexed files, groups, and scan history.

    Destructive — requires the literal query string ?confirm=RESET.
    """
    if confirm != "RESET":
        raise HTTPException(400, "Pass ?confirm=RESET to confirm")
    con = get_db()
    cur = con.cursor()
    # Fixed table names only — safe to interpolate.
    for table in ("duplicate_members", "duplicate_groups", "files", "scans"):
        cur.execute(f"DELETE FROM {table}")
    con.commit()
    con.close()
    sc.scan_state.update(
        scan_id=None, status="idle", phase="idle",
        progress=0, total=0, message="", stats={},
    )
    return {"success": True}
# ── Duplicate groups ──────────────────────────────────────────────────────────
@app.get("/api/groups")
def list_groups(
    method: str = "all",
    reviewed: str = "false",
    sort: str = "count",
    offset: int = 0,
    limit: int = 50,
):
    """Paginated listing of duplicate groups.

    Filters by detection method ("all" disables the filter) and review
    state ("false"/"true"; any other value returns both).  Returns the
    total matching count plus one summary per group, including the
    suggested keeper's thumbnail info when one is flagged.
    """
    con = get_db()
    cur = con.cursor()
    where = []
    params: list = []
    if method != "all":
        where.append("dg.method = ?")
        params.append(method)
    if reviewed == "false":
        where.append("dg.reviewed = 0")
    elif reviewed == "true":
        where.append("dg.reviewed = 1")
    where_clause = ("WHERE " + " AND ".join(where)) if where else ""
    # ORDER BY is interpolated into the SQL below, so it is restricted
    # to this fixed whitelist — never raw user input.
    order = {
        "count": "member_count DESC",
        "method": "dg.method, member_count DESC",
        "date": "dg.created_at DESC",
    }.get(sort, "member_count DESC")
    cur.execute(f"""
        SELECT COUNT(*) FROM duplicate_groups dg {where_clause}
    """, params)
    total = cur.fetchone()[0]
    cur.execute(f"""
        SELECT dg.id, dg.method, dg.reviewed,
               COUNT(dm.id) as member_count,
               (SELECT dm2.file_id FROM duplicate_members dm2
                WHERE dm2.group_id = dg.id AND dm2.suggested = 1
                LIMIT 1) as suggested_file_id
        FROM duplicate_groups dg
        LEFT JOIN duplicate_members dm ON dm.group_id = dg.id
        {where_clause}
        GROUP BY dg.id
        ORDER BY {order}
        LIMIT ? OFFSET ?
    """, params + [limit, offset])
    groups = []
    for row in cur.fetchall():
        meta = METHOD_META.get(row["method"], {"color": "#888", "label": row["method"]})
        suggested = None
        if row["suggested_file_id"]:
            # NOTE(review): one extra SELECT per listed group (N+1 pattern);
            # tolerable at limit<=50 but could be folded into the main JOIN.
            cur.execute(
                "SELECT id, filename, width, height FROM files WHERE id=?",
                (row["suggested_file_id"],),
            )
            f = cur.fetchone()
            if f:
                suggested = {
                    "file_id": f["id"],
                    "filename": f["filename"],
                    "width": f["width"],
                    "height": f["height"],
                    "thumb_url": f"/api/thumb/{f['id']}",
                }
        groups.append({
            "id": row["id"],
            "method": row["method"],
            "method_color": meta["color"],
            "method_label": meta["label"],
            "member_count": row["member_count"],
            "reviewed": bool(row["reviewed"]),
            "suggested_keeper": suggested,
        })
    con.close()
    return {"total": total, "groups": groups}
@app.get("/api/groups/{group_id}")
def get_group(group_id: int):
    """Return one duplicate group with its full member list.

    Members are ordered suggested-first, then by pixel area descending.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT * FROM duplicate_groups WHERE id=?", (group_id,))
    grp = cur.fetchone()
    if not grp:
        raise HTTPException(404, "Group not found")
    meta = METHOD_META.get(grp["method"], {"color": "#888", "label": grp["method"]})
    cur.execute("""
        SELECT f.id, f.filename, f.path, f.file_size, f.width, f.height,
               f.mime_type, f.exif_datetime, f.exif_device,
               f.is_takeout, f.is_edited,
               dm.is_keeper, dm.suggested
        FROM duplicate_members dm
        JOIN files f ON f.id = dm.file_id
        WHERE dm.group_id = ?
        ORDER BY dm.suggested DESC, f.width * f.height DESC
    """, (group_id,))

    def as_member(r) -> dict:
        # Shape one joined row into the API member payload.
        return {
            "file_id": r["id"],
            "filename": r["filename"],
            "path": r["path"],
            "file_size": r["file_size"],
            "width": r["width"],
            "height": r["height"],
            "mime_type": r["mime_type"],
            "exif_datetime": r["exif_datetime"],
            "exif_device": r["exif_device"],
            "is_takeout": bool(r["is_takeout"]),
            "is_edited": False if r["is_edited"] is None else bool(r["is_edited"]),
            "is_suggested": bool(r["suggested"]),
            "is_keeper": bool(r["is_keeper"]),
            "thumb_url": f"/api/thumb/{r['id']}",
        }

    members = [as_member(r) for r in cur.fetchall()]
    con.close()
    return {
        "id": grp["id"],
        "method": grp["method"],
        "method_color": meta["color"],
        "method_label": meta["label"],
        "method_value": grp["method_value"],
        "reviewed": bool(grp["reviewed"]),
        "members": members,
    }
# ── Decisions ─────────────────────────────────────────────────────────────────
class DecideBody(BaseModel):
    """Request body for POST /api/groups/{group_id}/decide."""
    # File that survives; every other member becomes 'redundant'.
    keeper_file_id: int
@app.post("/api/groups/{group_id}/decide")
def decide(group_id: int, body: DecideBody):
    """Choose the keeper for a group; all other members become 'redundant'."""
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    if not cur.fetchone():
        raise HTTPException(404, "Group not found")
    cur.execute("SELECT file_id FROM duplicate_members WHERE group_id=?", (group_id,))
    member_ids = [row["file_id"] for row in cur.fetchall()]
    if body.keeper_file_id not in member_ids:
        raise HTTPException(400, "keeper_file_id is not a member of this group")
    for member_id in member_ids:
        keep = int(member_id == body.keeper_file_id)
        cur.execute(
            "UPDATE duplicate_members SET is_keeper=? WHERE group_id=? AND file_id=?",
            (keep, group_id, member_id),
        )
        cur.execute(
            "UPDATE files SET status=? WHERE id=?",
            ("keeper" if keep else "redundant", member_id),
        )
    cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}
@app.post("/api/groups/{group_id}/skip")
def skip_group(group_id: int):
    """Mark a group reviewed without changing any file statuses."""
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    exists = cur.fetchone() is not None
    if not exists:
        raise HTTPException(404, "Group not found")
    cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}
@app.post("/api/groups/{group_id}/keep-all")
def keep_all(group_id: int):
    """Keep every member of a group and mark the group reviewed.

    Improvement over the original: the per-member Python loop (two
    statements per file) is replaced with two set-based UPDATEs —
    identical end state, constant number of statements.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    if not cur.fetchone():
        raise HTTPException(404, "Group not found")
    cur.execute(
        "UPDATE duplicate_members SET is_keeper=1 WHERE group_id=?",
        (group_id,),
    )
    cur.execute(
        "UPDATE files SET status='keeper' WHERE id IN "
        "(SELECT file_id FROM duplicate_members WHERE group_id=?)",
        (group_id,),
    )
    cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}
@app.post("/api/groups/{group_id}/unreview")
def unreview_group(group_id: int):
    """Undo a review: clear keeper flags and reset members to 'pending'.

    Improvement over the original: the per-member Python loop is
    replaced with two set-based UPDATEs — identical end state.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    if not cur.fetchone():
        raise HTTPException(404, "Group not found")
    cur.execute(
        "UPDATE duplicate_members SET is_keeper=0 WHERE group_id=?",
        (group_id,),
    )
    cur.execute(
        "UPDATE files SET status='pending' WHERE id IN "
        "(SELECT file_id FROM duplicate_members WHERE group_id=?)",
        (group_id,),
    )
    cur.execute("UPDATE duplicate_groups SET reviewed=0 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}
@app.post("/api/groups/auto-resolve-exact")
def auto_resolve_exact():
    """Bulk-resolve every unreviewed SHA-256 (byte-identical) group.

    For each group the member chosen by
    sc._suggested_keeper_by_resolution is kept and the rest are marked
    redundant; the group is flagged reviewed.  Returns
    {"resolved": <number of groups processed>}.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("""
        SELECT id FROM duplicate_groups
        WHERE method='sha256' AND reviewed=0
    """)
    groups = [r["id"] for r in cur.fetchall()]
    resolved = 0
    for gid in groups:
        cur.execute("""
            SELECT f.id, f.width, f.height, f.file_size, f.exif_datetime
            FROM duplicate_members dm
            JOIN files f ON f.id = dm.file_id
            WHERE dm.group_id = ?
        """, (gid,))
        members = [dict(r) for r in cur.fetchall()]
        if not members:
            # Empty group (members pruned elsewhere) — leave unreviewed.
            continue
        keeper_id = sc._suggested_keeper_by_resolution(members)
        for m in members:
            is_k = 1 if m["id"] == keeper_id else 0
            cur.execute(
                "UPDATE duplicate_members SET is_keeper=? WHERE group_id=? AND file_id=?",
                (is_k, gid, m["id"]),
            )
            cur.execute(
                "UPDATE files SET status=? WHERE id=?",
                ("keeper" if is_k else "redundant", m["id"]),
            )
        cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (gid,))
        resolved += 1
    con.commit()
    con.close()
    return {"resolved": resolved}
# ── Files + thumbnails ────────────────────────────────────────────────────────
# Inline SVG returned when a video frame cannot be extracted:
# dark tile with a purple play triangle.
VIDEO_PLACEHOLDER_SVG = """<svg xmlns="http://www.w3.org/2000/svg" width="200" height="200"
viewBox="0 0 200 200">
<rect width="200" height="200" fill="#1e1e2e"/>
<polygon points="75,55 75,145 145,100" fill="#9b7de8"/>
</svg>"""
# Extensions routed through the ffmpeg first-frame path in get_thumb.
VIDEO_EXT = {".mp4", ".mov", ".avi", ".mkv", ".m4v", ".3gp", ".wmv", ".mts", ".m2ts"}
@app.get("/api/thumb/{file_id}")
def get_thumb(file_id: int):
    """Serve a preview image for a file.

    Videos: first frame extracted with ffmpeg (JPEG), falling back to a
    static SVG play-button placeholder on any failure.  Photos: the
    original file is served as-is.

    NOTE(review): no downscaling or caching happens here — "thumb" is a
    misnomer for photos (full-size bytes are sent), and ffmpeg runs on
    every video request.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT path, mime_type, extension FROM files WHERE id=?", (file_id,))
    row = cur.fetchone()
    con.close()
    if not row:
        raise HTTPException(404, "File not found")
    path = row["path"]
    ext = (row["extension"] or "").lower()
    if not os.path.isfile(path):
        raise HTTPException(404, "File not on disk")
    if ext in VIDEO_EXT:
        # Try ffmpeg for first frame
        try:
            result = subprocess.run(
                [
                    "ffmpeg", "-i", path,
                    "-vframes", "1", "-f", "image2", "-vcodec", "mjpeg",
                    "pipe:1",
                ],
                capture_output=True, timeout=10,
            )
            if result.returncode == 0 and result.stdout:
                return Response(content=result.stdout, media_type="image/jpeg")
        except Exception:
            # ffmpeg missing/timeout — fall through to the placeholder.
            pass
        return Response(content=VIDEO_PLACEHOLDER_SVG, media_type="image/svg+xml")
    # Serve photo directly
    mime = row["mime_type"] or "application/octet-stream"
    return FileResponse(path, media_type=mime)
@app.get("/api/files/{file_id}")
def get_file_meta(file_id: int):
    """Return the raw files-table row for one file as JSON."""
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT * FROM files WHERE id=?", (file_id,))
    record = cur.fetchone()
    con.close()
    if record is None:
        raise HTTPException(404, "File not found")
    return dict(record)
# ── Stats ─────────────────────────────────────────────────────────────────────
@app.get("/api/stats")
def get_stats():
    """Aggregate dashboard numbers: totals, reclaimable bytes, per-method counts.

    NOTE(review): the "duplicate" query joins files to ALL of their
    group memberships with is_keeper=0 — a file that appears in several
    groups (different methods) is counted/summed once per membership,
    and unreviewed groups are included.  Confirm that inflation is
    acceptable for the dashboard.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT COUNT(*), SUM(file_size) FROM files WHERE status != 'error'")
    r = cur.fetchone()
    total_files = r[0] or 0
    total_size = r[1] or 0
    cur.execute("""
        SELECT COUNT(*), SUM(f.file_size)
        FROM files f
        JOIN duplicate_members dm ON dm.file_id = f.id
        WHERE dm.is_keeper = 0
    """)
    r = cur.fetchone()
    dup_files = r[0] or 0
    dup_size = r[1] or 0
    by_method = {}
    for method in ("sha256", "phash", "exif", "filesize"):
        cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE method=?", (method,))
        groups = cur.fetchone()[0]
        cur.execute("""
            SELECT COUNT(*) FROM duplicate_members dm
            JOIN duplicate_groups dg ON dg.id = dm.group_id
            WHERE dg.method = ?
        """, (method,))
        files = cur.fetchone()[0]
        by_method[method] = {"groups": groups, "files": files}
    cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
    reviewed = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
    pending = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM files WHERE is_takeout=1")
    takeout_files = cur.fetchone()[0]
    con.close()
    return {
        "total_files": total_files,
        "total_size_bytes": total_size,
        "duplicate_files": dup_files,
        "duplicate_size_bytes": dup_size,
        "groups_by_method": by_method,
        "reviewed": reviewed,
        "pending": pending,
        "takeout_files": takeout_files,
    }
# ── Export ────────────────────────────────────────────────────────────────────
@app.get("/api/export/csv")
def export_csv():
    """Download every duplicate group and its members as a CSV attachment."""
    con = get_db()
    cur = con.cursor()
    cur.execute("""
        SELECT dg.id as group_id, dg.method, f.id as file_id,
               f.path, f.filename, f.file_size,
               f.width, f.height, f.exif_datetime, f.exif_device,
               dm.is_keeper,
               CASE WHEN dm.is_keeper=0 AND dg.reviewed=1 THEN 1 ELSE 0 END as is_redundant,
               dg.reviewed
        FROM duplicate_groups dg
        JOIN duplicate_members dm ON dm.group_id = dg.id
        JOIN files f ON f.id = dm.file_id
        ORDER BY dg.id, dm.is_keeper DESC
    """)
    rows = cur.fetchall()
    con.close()
    # Output header names differ slightly from the column aliases.
    header = [
        "group_id", "method", "file_id", "path", "filename",
        "size", "width", "height", "exif_date", "device",
        "is_keeper", "is_redundant", "reviewed",
    ]
    columns = [
        "group_id", "method", "file_id", "path", "filename", "file_size",
        "width", "height", "exif_datetime", "exif_device",
        "is_keeper", "is_redundant", "reviewed",
    ]
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(header)
    writer.writerows([row[col] for col in columns] for row in rows)
    buf.seek(0)
    return StreamingResponse(
        iter([buf.getvalue()]),
        media_type="text/csv",
        headers={"Content-Disposition": "attachment; filename=dup-finder-export.csv"},
    )

758
app/scanner.py Normal file
View File

@@ -0,0 +1,758 @@
"""
File scanner: discovery, per-file extraction, and all 4 duplicate detection passes.
"""
import hashlib
import mimetypes
import os
import sqlite3
import subprocess
from datetime import datetime
from pathlib import Path
import imagehash
from PIL import Image, ExifTags, UnidentifiedImageError
try:
from pillow_heif import register_heif_opener
register_heif_opener()
except ImportError:
pass
from takeout import is_takeout_folder, process_takeout
# Media extensions the scanner indexes: photos get EXIF + perceptual
# hashing; videos only dimensions (via ffprobe) and mtime.
PHOTO_EXT = {
    ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif",
    ".webp", ".heic", ".heif", ".raw", ".cr2", ".nef", ".arw",
    ".dng", ".orf", ".rw2", ".pef", ".srw", ".x3f",
}
VIDEO_EXT = {
    ".mp4", ".mov", ".avi", ".mkv", ".m4v", ".3gp",
    ".wmv", ".mts", ".m2ts",
}
SUPPORTED_EXT = PHOTO_EXT | VIDEO_EXT
# SQLite database location (container-internal path).
DB_PATH = "/data/dupfinder.db"
# Shared scan state (updated by background thread, read by status endpoint)
scan_state = {
    "scan_id": None,
    "status": "idle",  # idle | running | complete | error | cancelled
    "phase": "idle",  # discovery | takeout | indexing | phash | grouping | done
    "progress": 0,
    "total": 0,
    "message": "",
    "cancel_requested": False,
    "stats": {},
}
# ── DB helpers ────────────────────────────────────────────────────────────────
def get_db() -> sqlite3.Connection:
    """Open a new connection to the app database (caller closes it).

    WAL journaling lets the background scan thread and request handlers
    read/write concurrently; foreign-key enforcement makes the schema's
    ON DELETE CASCADE clauses take effect.
    """
    con = sqlite3.connect(DB_PATH, timeout=30)
    con.row_factory = sqlite3.Row
    con.execute("PRAGMA journal_mode=WAL")
    con.execute("PRAGMA foreign_keys=ON")
    return con
def init_db():
    """Create tables and indexes if missing (idempotent; safe at every startup).

    Schema overview:
      files             — one row per media file with extracted metadata
      scans             — scan history/audit
      duplicate_groups  — one row per detected duplicate cluster
      duplicate_members — group membership; cascades with its group/file
    """
    con = get_db()
    cur = con.cursor()
    cur.executescript("""
        CREATE TABLE IF NOT EXISTS files (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            path TEXT UNIQUE NOT NULL,
            filename TEXT NOT NULL,
            extension TEXT,
            file_size INTEGER,
            mime_type TEXT,
            sha256 TEXT,
            phash TEXT,
            exif_datetime TEXT,
            exif_device TEXT,
            width INTEGER,
            height INTEGER,
            is_takeout INTEGER DEFAULT 0,
            is_edited INTEGER DEFAULT 0,
            takeout_json TEXT,
            scan_id INTEGER,
            status TEXT DEFAULT 'pending',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE TABLE IF NOT EXISTS scans (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            folder_path TEXT NOT NULL,
            started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            completed_at TIMESTAMP,
            total_files INTEGER DEFAULT 0,
            status TEXT DEFAULT 'running'
        );
        CREATE TABLE IF NOT EXISTS duplicate_groups (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            method TEXT NOT NULL,
            method_value TEXT,
            reviewed INTEGER DEFAULT 0,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE TABLE IF NOT EXISTS duplicate_members (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            group_id INTEGER REFERENCES duplicate_groups(id) ON DELETE CASCADE,
            file_id INTEGER REFERENCES files(id) ON DELETE CASCADE,
            is_keeper INTEGER DEFAULT 0,
            suggested INTEGER DEFAULT 0
        );
        CREATE INDEX IF NOT EXISTS idx_sha256 ON files(sha256);
        CREATE INDEX IF NOT EXISTS idx_phash ON files(phash);
        CREATE INDEX IF NOT EXISTS idx_exif_dt ON files(exif_datetime, exif_device);
        CREATE INDEX IF NOT EXISTS idx_size_dim ON files(file_size, width, height);
        CREATE INDEX IF NOT EXISTS idx_status ON files(status);
    """)
    con.commit()
    con.close()
# ── Per-file extraction ───────────────────────────────────────────────────────
def _sha256(path: str) -> str:
h = hashlib.sha256()
with open(path, "rb") as f:
while chunk := f.read(65536):
h.update(chunk)
return h.hexdigest()
def _exif_data(path: str) -> tuple[str | None, str | None]:
    """Returns (exif_datetime, exif_device) or (None, None).

    exif_datetime is normalized to ISO-8601 (YYYY-MM-DDTHH:MM:SS);
    exif_device is "Make Model" when either tag is present.  Any
    failure (unreadable file, no EXIF, bad date) degrades to None.
    """
    try:
        # Fix: the original never closed the Image, leaking a file handle
        # per call; `with` closes it as soon as EXIF is read.
        with Image.open(path) as img:
            exif_raw = img._getexif()  # legacy merged-IFD dict API
        if not exif_raw:
            return None, None
        exif = {ExifTags.TAGS.get(k, k): v for k, v in exif_raw.items()}
        dt = exif.get("DateTimeOriginal") or exif.get("DateTime")
        if dt:
            try:
                dt = datetime.strptime(dt, "%Y:%m:%d %H:%M:%S").strftime("%Y-%m-%dT%H:%M:%S")
            except ValueError:
                dt = None
        make = str(exif.get("Make", "")).strip()
        model = str(exif.get("Model", "")).strip()
        device = (make + " " + model).strip() if (make or model) else None
        return dt, device
    except Exception:
        return None, None
def _image_dims(path: str) -> tuple[int | None, int | None]:
    """Return (width, height) of an image, or (None, None) if unreadable."""
    try:
        with Image.open(path) as img:
            return img.size  # (width, height)
    except Exception:
        return None, None
def _phash(path: str) -> str | None:
    """Return the image's perceptual hash as a hex string, or None on failure."""
    try:
        with Image.open(path) as img:
            return str(imagehash.phash(img))
    except Exception:
        return None
def _video_dims(path: str) -> tuple[int | None, int | None]:
    """Return (width, height) of a video's first stream via ffprobe.

    Falls back to (None, None) on any failure (ffprobe missing, timeout,
    unparsable output).  NOTE(review): the return code is not checked —
    an ffprobe error simply yields empty stdout, which fails the parse.
    """
    try:
        result = subprocess.run(
            [
                "ffprobe", "-v", "error",
                "-select_streams", "v:0",
                "-show_entries", "stream=width,height",
                "-of", "csv=p=0",
                path,
            ],
            capture_output=True, text=True, timeout=10,
        )
        parts = result.stdout.strip().split(",")
        if len(parts) == 2:
            return int(parts[0]), int(parts[1])
    except Exception:
        pass
    return None, None
def _mtime_str(path: str) -> str | None:
try:
ts = os.path.getmtime(path)
return datetime.fromtimestamp(ts).strftime("%Y-%m-%dT%H:%M:%S")
except Exception:
return None
def extract_file(path: str) -> dict:
    """Build a files-table record dict for one media file.

    path/filename/extension are always filled; file_size, sha256,
    dimensions and EXIF fields are best-effort and stay None on
    failure.  Photos get dimensions + EXIF (falling back to mtime for
    the timestamp); videos get dimensions from ffprobe and use mtime.
    phash stays None here — it is computed in its own scan phase so the
    UI can report progress separately.
    """
    ext = Path(path).suffix.lower()
    filename = Path(path).name
    is_photo = ext in PHOTO_EXT
    is_video = ext in VIDEO_EXT
    record = {
        "path": path,
        "filename": filename,
        "extension": ext,
        "file_size": None,
        "mime_type": None,
        "sha256": None,
        "phash": None,
        "exif_datetime": None,
        "exif_device": None,
        "width": None,
        "height": None,
    }
    try:
        record["file_size"] = os.path.getsize(path)
    except OSError:
        pass
    record["mime_type"] = mimetypes.guess_type(path)[0]
    try:
        record["sha256"] = _sha256(path)
    except OSError:
        pass
    if is_photo:
        w, h = _image_dims(path)
        record["width"], record["height"] = w, h
        dt, device = _exif_data(path)
        # Fall back to filesystem mtime when EXIF has no timestamp.
        record["exif_datetime"] = dt or _mtime_str(path)
        record["exif_device"] = device
        # phash computed in separate phase for progress reporting
    elif is_video:
        w, h = _video_dims(path)
        record["width"], record["height"] = w, h
        record["exif_datetime"] = _mtime_str(path)
    return record
# ── Union-Find for phash grouping ────────────────────────────────────────────
class UnionFind:
    """Disjoint-set forest over int ids with path compression."""

    def __init__(self):
        # element -> parent; roots map to themselves.
        self.parent: dict[int, int] = {}

    def find(self, x: int) -> int:
        """Return the root of x's set, registering x if unseen.

        Fix: iterative with full path compression — the original
        recursed, which can raise RecursionError on very long parent
        chains.  The resulting parent map is identical.
        """
        parent = self.parent
        if x not in parent:
            parent[x] = x
            return x
        root = x
        while parent[root] != root:
            root = parent[root]
        # Second pass: point every node on the path directly at the root.
        while parent[x] != root:
            parent[x], x = root, parent[x]
        return root

    def union(self, x: int, y: int):
        """Merge the sets containing x and y (y's root absorbs x's)."""
        px, py = self.find(x), self.find(y)
        if px != py:
            self.parent[px] = py

    def groups(self) -> dict[int, list[int]]:
        """Return {root: members} for every set with at least 2 members."""
        from collections import defaultdict
        result: dict[int, list[int]] = defaultdict(list)
        for x in self.parent:
            result[self.find(x)].append(x)
        return {k: v for k, v in result.items() if len(v) >= 2}
# ── Detection passes ──────────────────────────────────────────────────────────
def _suggested_keeper_by_resolution(members: list[dict]) -> int:
"""Return file_id of highest resolution member; tie-break by size then oldest date."""
def score(m):
w = m["width"] or 0
h = m["height"] or 0
size = m["file_size"] or 0
dt = m["exif_datetime"] or "9999"
return (w * h, size, dt)
best = max(members, key=lambda m: (
(m["width"] or 0) * (m["height"] or 0),
m["file_size"] or 0,
# older date = better; invert by negating epoch or use str comparison inverted
))
return best["id"]
def _suggested_keeper_oldest(members: list[dict]) -> int:
def key(m):
return m["exif_datetime"] or "9999"
return min(members, key=key)["id"]
def _run_sha256_pass(con: sqlite3.Connection, scan_id: int):
    """Create one 'sha256' group per hash shared by 2+ files.

    First detection pass — runs over every hashed file with no
    exclusions.  The highest-resolution member of each group is flagged
    suggested=1.  (scan_id is currently unused; kept for signature
    parity with the other passes.)  Caller commits.
    """
    cur = con.cursor()
    cur.execute("""
        SELECT sha256, COUNT(*) as cnt
        FROM files
        WHERE sha256 IS NOT NULL
        GROUP BY sha256
        HAVING cnt > 1
    """)
    rows = cur.fetchall()
    for row in rows:
        sha = row["sha256"]
        cur.execute("""
            SELECT id, width, height, file_size, exif_datetime
            FROM files WHERE sha256 = ?
        """, (sha,))
        members = [dict(r) for r in cur.fetchall()]
        keeper_id = _suggested_keeper_by_resolution(members)
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('sha256', ?)",
            (sha,),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )
def _run_phash_pass(con: sqlite3.Connection, scan_id: int):
    """Group visually similar photos by perceptual-hash Hamming distance.

    Files already claimed by a sha256 group and all video extensions are
    excluded.  Candidates are bucketed by the first two hex chars of
    their phash to bound the pairwise comparison cost.
    NOTE(review): bucketing means near-duplicates whose hashes differ in
    the leading byte are never compared — a recall/speed trade-off.
    """
    cur = con.cursor()
    # Exclude files already in sha256 groups
    cur.execute("""
        SELECT f.id, f.phash, f.width, f.height, f.file_size, f.exif_datetime
        FROM files f
        WHERE f.phash IS NOT NULL
        AND f.extension NOT IN (
            '.mp4','.mov','.avi','.mkv','.m4v','.3gp','.wmv','.mts','.m2ts'
        )
        AND f.id NOT IN (
            SELECT dm.file_id FROM duplicate_members dm
            JOIN duplicate_groups dg ON dg.id = dm.group_id
            WHERE dg.method = 'sha256'
        )
    """)
    rows = [dict(r) for r in cur.fetchall()]
    if len(rows) < 2:
        return
    # Perf fix: parse each hex hash ONCE up front — the original called
    # imagehash.hex_to_hash on both operands inside the O(n^2) inner loop.
    parsed: dict[int, object] = {}
    for r in rows:
        try:
            parsed[r["id"]] = imagehash.hex_to_hash(r["phash"])
        except Exception:
            pass  # unparsable hash: its pairs are skipped, as before
    # Bucket by first 2 hex chars to reduce O(n²) comparisons
    buckets: dict[str, list[dict]] = {}
    for r in rows:
        buckets.setdefault(r["phash"][:2], []).append(r)
    uf = UnionFind()
    # Ensure all IDs are registered (singletons included)
    for r in rows:
        uf.find(r["id"])
    THRESHOLD = 10  # max Hamming distance treated as "same image"
    for bucket in buckets.values():
        for i in range(len(bucket)):
            ha = parsed.get(bucket[i]["id"])
            if ha is None:
                continue
            for j in range(i + 1, len(bucket)):
                hb = parsed.get(bucket[j]["id"])
                if hb is not None and ha - hb <= THRESHOLD:
                    uf.union(bucket[i]["id"], bucket[j]["id"])
    id_map = {r["id"]: r for r in rows}
    for member_ids in uf.groups().values():
        members = [id_map[mid] for mid in member_ids if mid in id_map]
        if len(members) < 2:
            continue
        keeper_id = _suggested_keeper_by_resolution(members)
        keeper = id_map[keeper_id]
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('phash', ?)",
            (keeper["phash"],),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )
def _run_exif_pass(con: sqlite3.Connection, scan_id: int):
    """Group files shot at the same instant on the same device.

    Only files not already claimed by a sha256 or phash group are
    considered; each (exif_datetime, exif_device) pair with 2+ such
    files becomes one 'exif' group.  Caller commits.
    """
    cur = con.cursor()
    # Files already claimed by an earlier, stronger pass.
    already_grouped = """
        SELECT dm.file_id FROM duplicate_members dm
        JOIN duplicate_groups dg ON dg.id = dm.group_id
        WHERE dg.method IN ('sha256', 'phash')
    """
    cur.execute(f"""
        SELECT exif_datetime, exif_device, COUNT(*) as cnt
        FROM files
        WHERE exif_datetime IS NOT NULL
        AND exif_device IS NOT NULL
        AND id NOT IN ({already_grouped})
        GROUP BY exif_datetime, exif_device
        HAVING cnt > 1
    """)
    rows = cur.fetchall()
    for row in rows:
        dt, dev = row["exif_datetime"], row["exif_device"]
        # Fix: the member query must apply the same exclusion as the
        # counting query above — previously files already in sha256 or
        # phash groups were pulled back into the exif group even though
        # they had been filtered out of the count.
        cur.execute(f"""
            SELECT id, width, height, file_size, exif_datetime
            FROM files
            WHERE exif_datetime = ? AND exif_device = ?
            AND id NOT IN ({already_grouped})
        """, (dt, dev))
        members = [dict(r) for r in cur.fetchall()]
        if len(members) < 2:
            continue
        keeper_id = _suggested_keeper_by_resolution(members)
        method_value = f"{dt}::{dev}"
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('exif', ?)",
            (method_value,),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )
def _run_filesize_pass(con: sqlite3.Connection, scan_id: int):
    """Group files with identical (file_size, width, height).

    Weakest signal — only files not already claimed by any earlier pass
    are considered.  The oldest member of each group is suggested.
    Caller commits.
    """
    cur = con.cursor()
    # Files already claimed by an earlier, stronger pass.
    already_grouped = """
        SELECT dm.file_id FROM duplicate_members dm
        JOIN duplicate_groups dg ON dg.id = dm.group_id
        WHERE dg.method IN ('sha256', 'phash', 'exif')
    """
    cur.execute(f"""
        SELECT file_size, width, height, COUNT(*) as cnt
        FROM files
        WHERE file_size IS NOT NULL
        AND width IS NOT NULL
        AND height IS NOT NULL
        AND id NOT IN ({already_grouped})
        GROUP BY file_size, width, height
        HAVING cnt > 1
    """)
    rows = cur.fetchall()
    for row in rows:
        fs, w, h = row["file_size"], row["width"], row["height"]
        # Fix: apply the same exclusion when fetching members — the
        # original re-selected ALL matching files, pulling already-
        # grouped files back into the filesize group.
        cur.execute(f"""
            SELECT id, width, height, file_size, exif_datetime
            FROM files
            WHERE file_size = ? AND width = ? AND height = ?
            AND id NOT IN ({already_grouped})
        """, (fs, w, h))
        members = [dict(r) for r in cur.fetchall()]
        if len(members) < 2:
            continue
        keeper_id = _suggested_keeper_oldest(members)
        method_value = f"{fs}::{w}x{h}"
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('filesize', ?)",
            (method_value,),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )
# ── Main scan entry point ─────────────────────────────────────────────────────
def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
    """Main scan function — runs in background thread.

    Phases: discovery -> takeout pre-processing -> indexing (sha256 +
    EXIF + dimensions) -> perceptual hashing -> four grouping passes.
    Progress and cancellation flow through the module-level scan_state
    dict, which the status endpoint polls; cancellation is checked at
    phase boundaries and per file.

    mode:
      incremental — re-hash only new/changed files, rebuild all groups
      full_reset  — wipe files/groups first, index everything
      new_files   — index only unseen paths, rebuild affected groups
      regroup     — skip indexing work, rebuild all groups
    """
    global scan_state  # NOTE(review): redundant — dict is mutated, never rebound
    con = get_db()
    cur = con.cursor()
    try:
        # ── Phase: discovery ──────────────────────────────────────────────
        scan_state.update(phase="discovery", progress=0, total=0,
                          message="Discovering files...")
        all_files = []
        for root, dirs, files in os.walk(folder_path):
            # Prune hidden directories in place so os.walk never descends.
            dirs[:] = [d for d in dirs if not d.startswith(".")]
            for fname in files:
                # Takeout JSON sidecars are metadata, not media.
                if fname.endswith(".json"):
                    continue
                ext = Path(fname).suffix.lower()
                if ext in SUPPORTED_EXT:
                    all_files.append(os.path.join(root, fname))
        scan_state["total"] = len(all_files)
        scan_state["message"] = f"Found {len(all_files):,} files"
        if scan_state["cancel_requested"]:
            _mark_scan(cur, scan_id, "cancelled")
            con.commit()
            scan_state["status"] = "cancelled"
            return
        # ── Mode: full reset ──────────────────────────────────────────────
        if mode == "full_reset":
            cur.execute("DELETE FROM duplicate_members")
            cur.execute("DELETE FROM duplicate_groups")
            cur.execute("DELETE FROM files")
            con.commit()
        # ── Phase: takeout pre-processing ─────────────────────────────────
        scan_state.update(phase="takeout", message="Checking for Google Takeout structure...")
        if is_takeout_folder(folder_path):
            scan_state["message"] = "Processing Google Takeout sidecars..."
            process_takeout(folder_path, DB_PATH)
        if scan_state["cancel_requested"]:
            _mark_scan(cur, scan_id, "cancelled")
            con.commit()
            scan_state["status"] = "cancelled"
            return
        # ── Phase: indexing ───────────────────────────────────────────────
        scan_state.update(phase="indexing", progress=0,
                          message="Indexing files (SHA-256 + EXIF + dimensions)...")
        for i, path in enumerate(all_files):
            if scan_state["cancel_requested"]:
                _mark_scan(cur, scan_id, "cancelled")
                con.commit()
                scan_state["status"] = "cancelled"
                return
            scan_state["progress"] = i + 1
            scan_state["message"] = f"Indexing: {Path(path).name}"
            # Check existing record
            cur.execute("SELECT id, file_size, updated_at FROM files WHERE path = ?", (path,))
            existing = cur.fetchone()
            try:
                current_size = os.path.getsize(path)
            except OSError:
                # File vanished between discovery and indexing — skip it.
                continue
            if existing and mode in ("incremental", "new_files"):
                if mode == "new_files":
                    # Skip entirely — don't re-hash existing files
                    cur.execute("UPDATE files SET scan_id = ? WHERE path = ?", (scan_id, path))
                    continue
                # Incremental: skip if size unchanged (use size as proxy for change)
                if existing["file_size"] == current_size:
                    cur.execute("UPDATE files SET scan_id = ? WHERE path = ?", (scan_id, path))
                    continue
                # File changed — re-hash, clear group memberships
                cur.execute(
                    "DELETE FROM duplicate_members WHERE file_id = ?", (existing["id"],)
                )
            try:
                record = extract_file(path)
            except Exception as e:
                # Record the failure so the file stays visible but is
                # excluded from stats and grouping (status='error').
                cur.execute(
                    "INSERT OR IGNORE INTO files (path, filename, extension, scan_id, status) "
                    "VALUES (?, ?, ?, ?, 'error')",
                    (path, Path(path).name, Path(path).suffix.lower(), scan_id),
                )
                cur.execute(
                    "UPDATE files SET status='error', scan_id=?, updated_at=CURRENT_TIMESTAMP "
                    "WHERE path=?",
                    (scan_id, path),
                )
                con.commit()
                continue
            record["scan_id"] = scan_id
            if existing:
                # NOTE(review): phash is not reset by this UPDATE, so a
                # *changed* file keeps its stale perceptual hash (the
                # phash phase below only fills NULLs) — confirm intended.
                cur.execute("""
                    UPDATE files SET
                        filename=:filename, extension=:extension, file_size=:file_size,
                        mime_type=:mime_type, sha256=:sha256,
                        exif_datetime=:exif_datetime, exif_device=:exif_device,
                        width=:width, height=:height, scan_id=:scan_id,
                        status='pending', updated_at=CURRENT_TIMESTAMP
                    WHERE path=:path
                """, record)
            else:
                cur.execute("""
                    INSERT OR IGNORE INTO files
                    (path, filename, extension, file_size, mime_type, sha256,
                     exif_datetime, exif_device, width, height, scan_id, status)
                    VALUES
                    (:path, :filename, :extension, :file_size, :mime_type, :sha256,
                     :exif_datetime, :exif_device, :width, :height, :scan_id, 'pending')
                """, record)
            # Commit in batches to keep transactions bounded.
            if (i + 1) % 100 == 0:
                con.commit()
        con.commit()
        # ── Phase: phash ──────────────────────────────────────────────────
        scan_state.update(phase="phash", progress=0,
                          message="Computing perceptual hashes...")
        cur.execute("""
            SELECT id, path FROM files
            WHERE extension IN (
                '.jpg','.jpeg','.png','.gif','.bmp','.tiff','.tif',
                '.webp','.heic','.heif','.raw','.cr2','.nef','.arw',
                '.dng','.orf','.rw2','.pef','.srw','.x3f'
            ) AND phash IS NULL AND status != 'error'
        """)
        photo_rows = cur.fetchall()
        scan_state["total"] = len(photo_rows)
        for i, row in enumerate(photo_rows):
            if scan_state["cancel_requested"]:
                _mark_scan(cur, scan_id, "cancelled")
                con.commit()
                scan_state["status"] = "cancelled"
                return
            scan_state["progress"] = i + 1
            scan_state["message"] = f"Phash: {Path(row['path']).name}"
            ph = _phash(row["path"])
            if ph:
                cur.execute("UPDATE files SET phash=? WHERE id=?", (ph, row["id"]))
            if (i + 1) % 200 == 0:
                con.commit()
        con.commit()
        # ── Phase: grouping ───────────────────────────────────────────────
        scan_state.update(phase="grouping", progress=0, total=4,
                          message="Running duplicate detection...")
        if mode in ("incremental", "full_reset", "regroup"):
            # Rebuild every group from scratch.
            cur.execute("DELETE FROM duplicate_members")
            cur.execute("DELETE FROM duplicate_groups")
            con.commit()
        elif mode == "new_files":
            # Only clear groups containing new files
            cur.execute("""
                DELETE FROM duplicate_groups WHERE id IN (
                    SELECT DISTINCT dm.group_id FROM duplicate_members dm
                    JOIN files f ON f.id = dm.file_id
                    WHERE f.scan_id = ?
                )
            """, (scan_id,))
            con.commit()
        scan_state["message"] = "Pass 1/4: SHA-256 exact duplicates..."
        _run_sha256_pass(con, scan_id)
        scan_state["progress"] = 1
        con.commit()
        scan_state["message"] = "Pass 2/4: Perceptual hash similarity..."
        _run_phash_pass(con, scan_id)
        scan_state["progress"] = 2
        con.commit()
        scan_state["message"] = "Pass 3/4: EXIF timestamp + device..."
        _run_exif_pass(con, scan_id)
        scan_state["progress"] = 3
        con.commit()
        scan_state["message"] = "Pass 4/4: File size + dimensions..."
        _run_filesize_pass(con, scan_id)
        scan_state["progress"] = 4
        con.commit()
        # ── Restore keeper statuses for mode=incremental ──────────────────
        if mode == "incremental":
            # If a previously marked keeper no longer appears in any group, reset to pending
            cur.execute("""
                UPDATE files SET status='pending'
                WHERE status='keeper'
                AND id NOT IN (
                    SELECT file_id FROM duplicate_members WHERE is_keeper=1
                )
            """)
            con.commit()
        # Update scan record
        cur.execute(
            "UPDATE scans SET completed_at=CURRENT_TIMESTAMP, total_files=?, status='complete' "
            "WHERE id=?",
            (len(all_files), scan_id),
        )
        con.commit()
        scan_state.update(status="complete", phase="done",
                          message="Scan complete.", progress=scan_state["total"])
        _update_stats()
    except Exception as e:
        # Surface the failure via scan_state and the scans row; never let
        # the worker thread die silently.
        scan_state.update(status="error", message=str(e))
        try:
            _mark_scan(cur, scan_id, "error")
            con.commit()
        except Exception:
            pass
    finally:
        con.close()
def _mark_scan(cur, scan_id: int, status: str):
cur.execute(
"UPDATE scans SET completed_at=CURRENT_TIMESTAMP, status=? WHERE id=?",
(status, scan_id),
)
def _update_stats():
    """Refresh aggregate duplicate statistics into ``scan_state["stats"]``.

    Computes: total non-errored files, count/bytes of redundant files,
    per-method group and member counts, and reviewed/pending group tallies.

    Best-effort by design: any failure is swallowed so a stats refresh can
    never break a running scan; the previous stats simply remain in place.
    """
    try:
        con = get_db()
        try:
            cur = con.cursor()
            # Total ingested files, excluding ones that errored during scan.
            cur.execute("SELECT COUNT(*) FROM files WHERE status != 'error'")
            total_files = cur.fetchone()[0]
            # Count and reclaimable bytes of files marked redundant.
            cur.execute(
                "SELECT COUNT(*), SUM(file_size) FROM files WHERE status='redundant'"
            )
            r = cur.fetchone()
            dup_count = r[0] or 0
            dup_size = r[1] or 0  # SUM() is NULL when no rows match
            # Per-method group/file counts in a single grouped query.
            # (A previous per-method COUNT loop here was dead code — its
            # results were discarded — and has been removed.)
            cur.execute("""
                SELECT method,
                       COUNT(*) as groups,
                       (SELECT COUNT(*) FROM duplicate_members dm2
                        JOIN duplicate_groups dg2 ON dg2.id=dm2.group_id
                        WHERE dg2.method=dg.method) as files
                FROM duplicate_groups dg
                GROUP BY method
            """)
            by_method = {
                row["method"]: {"groups": row["groups"], "files": row["files"]}
                for row in cur.fetchall()
            }
            cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
            reviewed = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
            pending = cur.fetchone()[0]
            scan_state["stats"] = {
                "total_files": total_files,
                "duplicate_files": dup_count,
                "duplicate_size_bytes": dup_size,
                "groups_by_method": by_method,
                "reviewed": reviewed,
                "pending": pending,
            }
        finally:
            # Always release the connection (previously leaked on exception).
            con.close()
    except Exception:
        # Deliberate best-effort swallow: stats are advisory only.
        pass

149
app/takeout.py Normal file
View File

@@ -0,0 +1,149 @@
"""
Google Takeout pre-processor.
Detects Takeout folder structures, reads JSON sidecars, and enriches
the files table with corrected timestamps, normalized filenames, and
edit-version flags.
"""
import json
import os
import re
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
# Filename-stem suffixes Google Photos appends to edited variants of a
# photo (e.g. "IMG_0001-edited.jpg"); matched case-insensitively by
# _is_edited() below, which lowercases the stem before comparing.
EDIT_SUFFIXES = ("-edited", "-effects", "-smile", "-mix")
def _find_sidecar(media_path: str) -> str | None:
"""Return path to the JSON sidecar for a media file, or None."""
p = Path(media_path)
# Try filename.ext.json first, then filename.json
candidates = [
str(p) + ".json",
str(p.with_suffix(".json")),
]
for c in candidates:
if os.path.isfile(c):
return c
return None
def _strip_collision_suffix(filename: str) -> str:
"""Strip Google's (1), (2) collision suffixes from a filename."""
stem = Path(filename).stem
ext = Path(filename).suffix
cleaned = re.sub(r"\(\d+\)$", "", stem).rstrip()
return cleaned + ext
def _is_edited(filename: str) -> bool:
    """Return True when the stem carries one of Google's edit suffixes
    (case-insensitive), e.g. "IMG_0001-edited.jpg".
    """
    # str.endswith accepts a tuple, so one call covers every suffix.
    return Path(filename).stem.lower().endswith(EDIT_SUFFIXES)
def is_takeout_folder(folder_path: str) -> bool:
    """
    Detect a Google Takeout export by structure alone.

    Walks *folder_path* counting ".json" files that sit next to a media
    file of the same name (the Takeout sidecar pattern). Five or more
    such pairs is treated as conclusive. Hidden directories (dot-prefixed)
    are pruned and never descended into.
    """
    pairs_seen = 0
    for root, dirs, names in os.walk(folder_path):
        # Prune hidden directories in place so os.walk skips them.
        dirs[:] = [d for d in dirs if not d.startswith(".")]
        present = set(names)
        for sidecar in (n for n in names if n.endswith(".json")):
            # "IMG.jpg.json" pairs with "IMG.jpg" in the same directory.
            if sidecar[:-5] in present:
                pairs_seen += 1
        # Threshold checked once per directory, matching scan granularity.
        if pairs_seen >= 5:
            return True
    return False
def process_takeout(folder_path: str, db_path: str) -> int:
    """
    Walk *folder_path*, find media files with Google Takeout JSON
    sidecars, and enrich their rows in the ``files`` table of *db_path*.

    For each sidecar found, the matching row (by path) gets:
    ``is_takeout=1``, a normalized filename (sidecar title when present,
    "(N)" collision suffix stripped), the raw sidecar JSON, and — when
    the sidecar carries a parseable ``photoTakenTime`` — a corrected
    UTC ``exif_datetime``. Files whose name carries a Google edit suffix
    are additionally flagged via the optional ``is_edited`` column when
    that column exists.

    Returns the number of files whose DB row was actually updated.
    """
    con = sqlite3.connect(db_path)
    con.row_factory = sqlite3.Row
    enriched = 0
    try:
        cur = con.cursor()
        for root, dirs, files in os.walk(folder_path):
            dirs[:] = [d for d in dirs if not d.startswith(".")]
            for fname in files:
                if fname.endswith(".json"):
                    continue  # sidecars themselves are not media
                media_path = os.path.join(root, fname)
                sidecar = _find_sidecar(media_path)
                if not sidecar:
                    continue
                try:
                    with open(sidecar, "r", encoding="utf-8") as f:
                        data = json.load(f)
                except (json.JSONDecodeError, OSError):
                    continue  # unreadable/corrupt sidecar: skip this file
                # photoTakenTime is Google's capture time in epoch seconds
                # (UTC); prefer it because embedded EXIF is often missing.
                photo_taken_ts = None
                try:
                    ts = int(data["photoTakenTime"]["timestamp"])
                    dt = datetime.fromtimestamp(ts, tz=timezone.utc)
                    photo_taken_ts = dt.strftime("%Y-%m-%dT%H:%M:%S")
                except (KeyError, ValueError, TypeError):
                    pass
                title = data.get("title", "")
                # Prefer the sidecar title (the original upload name) over
                # the on-disk name, which Takeout may have renamed.
                if title:
                    normalized = _strip_collision_suffix(title)
                else:
                    normalized = _strip_collision_suffix(fname)
                updates = {
                    "is_takeout": 1,
                    "filename": normalized,
                    "takeout_json": json.dumps(data),
                }
                if photo_taken_ts:
                    updates["exif_datetime"] = photo_taken_ts
                set_clause = ", ".join(f"{k} = ?" for k in updates)
                cur.execute(
                    f"UPDATE files SET {set_clause}, updated_at = CURRENT_TIMESTAMP "
                    f"WHERE path = ?",
                    list(updates.values()) + [media_path],
                )
                # Fix: read rowcount of the MAIN update immediately, before
                # the optional is_edited statement (or its swallowed error)
                # can clobber cur.rowcount and miscount edited files.
                if cur.rowcount > 0:
                    enriched += 1
                if _is_edited(fname):
                    # is_edited is a later migration; tolerate its absence.
                    try:
                        cur.execute(
                            "UPDATE files SET is_edited = 1 WHERE path = ?",
                            (media_path,),
                        )
                    except sqlite3.OperationalError:
                        pass  # column doesn't exist yet, skip
        con.commit()
    finally:
        # Fix: connection previously leaked if any exception escaped.
        con.close()
    return enriched