Files
duplicate-finder/app/main.py
Carlos 3001be3a92 Fix correctness bugs in scanner and reset endpoint
- Defer Takeout sidecar enrichment until after indexing so its UPDATE
  statements actually match rows. Previously it ran first and silently
  no-op'd on the very first scan because no files existed in the DB yet.

- Preserve user review decisions across incremental and regroup rescans.
  The grouping phase wipes duplicate_groups/duplicate_members, which
  also wiped reviewed=1 / is_keeper flags. Now snapshots reviewed groups
  by (method, frozenset of member file_ids) before the wipe and re-applies
  them to any post-regrouping group whose member set is unchanged.

- Replace 2-hex-char phash bucketing with multi-index pigeonhole
  (16 nibble buckets per hash). At threshold=10, the previous bucketing
  missed any near-duplicate pair that differed in the first byte, since
  they landed in different buckets and were never compared. Caches
  imagehash.hex_to_hash() per phash and dedups pair comparisons.

- Rewrite _suggested_keeper_by_resolution: previous implementation had
  a dead inner score() function and the lambda was missing the date
  tie-breaker (left as a TODO comment). Now picks largest pixels, ties
  by file size, then by oldest exif_datetime.

- Filter phash candidates to length(phash)=16 to skip malformed hashes
  rather than relying on the silent except in the comparison loop.

- Reject /api/scan/reset while a scan is running. Resetting mid-scan
  wiped tables the running scan thread was still writing to.

- Also clears stale 'redundant' file status (not just 'keeper') when
  a file no longer appears in any group after regrouping.
2026-04-24 00:42:13 -04:00

700 lines
22 KiB
Python

"""
FastAPI application — all API routes for the duplicate finder.
"""
import csv
import io
import os
import sqlite3
import subprocess
import threading
from pathlib import Path
from typing import Optional
from fastapi import FastAPI, HTTPException, Query, Request
from fastapi.responses import (
FileResponse, JSONResponse, Response, StreamingResponse
)
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel
import scanner as sc
app = FastAPI(title="Duplicate Finder")

# Resolve template/static directories relative to this file so the app works
# both inside the Docker image and when run from a source checkout.
_BASE = Path(__file__).parent


def _first_existing_dir(candidates, fallback):
    """Return the first candidate directory that exists, else the fallback."""
    for cand in candidates:
        if cand.exists():
            return str(cand)
    return fallback


_TEMPLATES_DIR = _first_existing_dir(
    [_BASE / "templates", _BASE.parent / "templates"], "/app/templates"
)
_STATIC_DIR = _first_existing_dir([_BASE / "static"], "/app/static")

# Make sure the static directory exists before mounting it.
Path(_STATIC_DIR).mkdir(parents=True, exist_ok=True)

templates = Jinja2Templates(directory=_TEMPLATES_DIR)
app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")

# Presentation metadata per detection method (badge color + human-readable label).
METHOD_META = {
    "sha256": {"color": "#378ADD", "label": "Exact copy"},
    "phash": {"color": "#9b7de8", "label": "Visual match"},
    "exif": {"color": "#e2a43a", "label": "Same moment"},
    "filesize": {"color": "#888780", "label": "Possible match"},
}
# ── Startup ───────────────────────────────────────────────────────────────────
@app.on_event("startup")
def startup():
    """Initialize the SQLite schema before the first request is served."""
    # Delegates to the scanner module, which owns the DB layout.
    sc.init_db()
# ── Frontend ──────────────────────────────────────────────────────────────────
@app.get("/")
def index(request: Request):
    """Serve the single-page frontend (templates/index.html)."""
    return templates.TemplateResponse("index.html", {"request": request})
# ── DB helper ─────────────────────────────────────────────────────────────────
def get_db() -> sqlite3.Connection:
    """Open a new SQLite connection via the scanner's factory.

    Callers are responsible for closing the returned connection.
    """
    return sc.get_db()
# ── Scan management ───────────────────────────────────────────────────────────
class ScanStartBody(BaseModel):
    """Request body for POST /api/scan/start."""
    folder_path: str
    # One of: incremental, full_reset, new_files, regroup (validated in scan_start).
    mode: str = "incremental"
@app.post("/api/scan/start")
def scan_start(body: ScanStartBody):
    """Validate the request, record a new scan row, and launch the scan thread.

    Returns {"scan_id": ...}. Raises 400 if a scan is already running or the
    mode is unknown. In "full_reset" mode all existing data is wiped first,
    atomically in the same transaction that records the new scan.
    """
    if sc.scan_state["status"] == "running":
        raise HTTPException(400, "A scan is already running")
    mode = body.mode
    if mode not in ("incremental", "full_reset", "new_files", "regroup"):
        raise HTTPException(400, f"Unknown scan mode: {mode}")
    # Single connection for the optional wipe and the scan-row insert
    # (previously two open/close cycles); finally guarantees no leak.
    con = get_db()
    try:
        cur = con.cursor()
        if mode == "full_reset":
            # Children first so any FK constraints are respected.
            cur.execute("DELETE FROM duplicate_members")
            cur.execute("DELETE FROM duplicate_groups")
            cur.execute("DELETE FROM files")
            cur.execute("DELETE FROM scans")
        cur.execute(
            "INSERT INTO scans (folder_path, status) VALUES (?, 'running')",
            (body.folder_path,),
        )
        scan_id = cur.lastrowid
        con.commit()
    finally:
        con.close()
    sc.scan_state.update(
        scan_id=scan_id,
        status="running",
        phase="takeout",
        progress=0,
        total=0,
        message="Starting...",
        pause_requested=False,
        files_indexed=0,
        phashes_done=0,
        folder_path=body.folder_path,
        stats={},
    )
    # Daemon thread so a hung scan never blocks process shutdown.
    thread = threading.Thread(
        target=sc.run_scan,
        args=(body.folder_path, scan_id, mode),
        daemon=True,
    )
    thread.start()
    return {"scan_id": scan_id}
@app.get("/api/scan/status")
def scan_status():
    """Combine in-memory scan-thread state with live DB counts.

    Returns the scan_state fields plus a `stats` dict of group/file counts.
    """
    state = sc.scan_state
    con = get_db()
    try:
        cur = con.cursor()
        # One GROUP BY pass instead of a COUNT(*) query per method;
        # methods with no groups default to 0.
        stats = {f"{m}_groups": 0 for m in ("sha256", "phash", "exif", "filesize")}
        cur.execute("SELECT method, COUNT(*) FROM duplicate_groups GROUP BY method")
        for method, count in cur.fetchall():
            key = f"{method}_groups"
            if key in stats:
                stats[key] = count
        cur.execute("SELECT COUNT(*) FROM duplicate_groups")
        stats["total_groups"] = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
        stats["reviewed"] = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
        stats["pending"] = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM files WHERE status != 'error'")
        stats["total_files"] = cur.fetchone()[0]
    finally:
        # Close even if a query fails so connections don't leak.
        con.close()
    return {
        "scan_id": state["scan_id"],
        "status": state["status"],
        "phase": state["phase"],
        "progress": state["progress"],
        "total": state["total"],
        "message": state["message"],
        "folder_path": state.get("folder_path"),
        "files_indexed": state.get("files_indexed", 0),
        "phashes_done": state.get("phashes_done", 0),
        "stats": stats,
    }
@app.post("/api/scan/pause")
def scan_pause():
    """Ask the running scan thread to stop at its next checkpoint."""
    state = sc.scan_state
    if state["status"] != "running":
        raise HTTPException(400, "No scan is currently running")
    # The scan thread polls this flag; pausing is cooperative.
    state["pause_requested"] = True
    return {"success": True}
# Keep /cancel as an alias so any lingering clients still work
@app.post("/api/scan/cancel")
def scan_cancel():
    """Deprecated alias for /api/scan/pause."""
    return scan_pause()
@app.post("/api/scan/resume")
def scan_resume():
    """Restart a paused scan as a fresh incremental pass over the saved folder."""
    state = sc.scan_state
    if state["status"] != "paused":
        raise HTTPException(400, "No paused scan to resume")
    folder = state.get("folder_path")
    if not folder:
        raise HTTPException(400, "No folder path saved — please start a new scan")
    # Record a brand-new scan row; resuming is just a new incremental run.
    con = get_db()
    cur = con.cursor()
    cur.execute(
        "INSERT INTO scans (folder_path, status) VALUES (?, 'running')",
        (folder,),
    )
    new_scan_id = cur.lastrowid
    con.commit()
    con.close()
    state.update(
        scan_id=new_scan_id,
        status="running",
        phase="takeout",
        progress=0,
        total=0,
        message="Resuming scan...",
        pause_requested=False,
        files_indexed=0,
        phashes_done=0,
        folder_path=folder,
        stats={},
    )
    worker = threading.Thread(
        target=sc.run_scan,
        args=(folder, new_scan_id, "incremental"),
        daemon=True,
    )
    worker.start()
    return {"scan_id": new_scan_id}
@app.delete("/api/scan/reset")
def scan_reset(confirm: str = Query("")):
    """Wipe all scan data and reset in-memory state.

    Requires ?confirm=RESET; refused while a scan is in progress so we never
    delete tables a running scan thread is still writing to.
    """
    if confirm != "RESET":
        raise HTTPException(400, "Pass ?confirm=RESET to confirm")
    if sc.scan_state["status"] == "running":
        raise HTTPException(
            400, "A scan is currently running — pause it before resetting"
        )
    con = get_db()
    cur = con.cursor()
    # Fixed table list (children first); not user input.
    for table in ("duplicate_members", "duplicate_groups", "files", "scans"):
        cur.execute(f"DELETE FROM {table}")
    con.commit()
    con.close()
    sc.scan_state.update(
        scan_id=None, status="idle", phase="idle",
        progress=0, total=0, message="",
        pause_requested=False, files_indexed=0,
        phashes_done=0, folder_path=None, stats={},
    )
    return {"success": True}
# ── Duplicate groups ──────────────────────────────────────────────────────────
@app.get("/api/groups")
def list_groups(
    method: str = "all",
    reviewed: str = "false",
    sort: str = "count",
    offset: int = 0,
    limit: int = 50,
):
    """Paginated group listing with member counts and the suggested keeper."""
    con = get_db()
    cur = con.cursor()
    # Build the WHERE clause from the optional filters.
    filters = []
    params: list = []
    if method != "all":
        filters.append("dg.method = ?")
        params.append(method)
    if reviewed == "false":
        filters.append("dg.reviewed = 0")
    elif reviewed == "true":
        filters.append("dg.reviewed = 1")
    where_clause = ("WHERE " + " AND ".join(filters)) if filters else ""
    # Whitelisted ORDER BY expressions keyed by the public sort name.
    order = {
        "count": "member_count DESC",
        "method": "dg.method, member_count DESC",
        "date": "dg.created_at DESC",
    }.get(sort, "member_count DESC")
    cur.execute(f"""
        SELECT COUNT(*) FROM duplicate_groups dg {where_clause}
    """, params)
    total = cur.fetchone()[0]
    cur.execute(f"""
        SELECT dg.id, dg.method, dg.reviewed,
               COUNT(dm.id) as member_count,
               (SELECT dm2.file_id FROM duplicate_members dm2
                WHERE dm2.group_id = dg.id AND dm2.suggested = 1
                LIMIT 1) as suggested_file_id
        FROM duplicate_groups dg
        LEFT JOIN duplicate_members dm ON dm.group_id = dg.id
        {where_clause}
        GROUP BY dg.id
        ORDER BY {order}
        LIMIT ? OFFSET ?
    """, params + [limit, offset])
    page_rows = cur.fetchall()
    groups = []
    for row in page_rows:
        meta = METHOD_META.get(row["method"], {"color": "#888", "label": row["method"]})
        suggested = None
        suggested_id = row["suggested_file_id"]
        if suggested_id:
            cur.execute(
                "SELECT id, filename, width, height FROM files WHERE id=?",
                (suggested_id,),
            )
            frow = cur.fetchone()
            if frow:
                suggested = {
                    "file_id": frow["id"],
                    "filename": frow["filename"],
                    "width": frow["width"],
                    "height": frow["height"],
                    "thumb_url": f"/api/thumb/{frow['id']}",
                }
        groups.append({
            "id": row["id"],
            "method": row["method"],
            "method_color": meta["color"],
            "method_label": meta["label"],
            "member_count": row["member_count"],
            "reviewed": bool(row["reviewed"]),
            "suggested_keeper": suggested,
        })
    con.close()
    return {"total": total, "groups": groups}
@app.get("/api/groups/{group_id}")
def get_group(group_id: int):
    """Full detail for one group: metadata plus every member file."""
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT * FROM duplicate_groups WHERE id=?", (group_id,))
    grp = cur.fetchone()
    if not grp:
        raise HTTPException(404, "Group not found")
    meta = METHOD_META.get(grp["method"], {"color": "#888", "label": grp["method"]})
    # Suggested keeper first, then largest pixel count.
    cur.execute("""
        SELECT f.id, f.filename, f.path, f.file_size, f.width, f.height,
               f.mime_type, f.exif_datetime, f.exif_device,
               f.is_takeout, f.is_edited,
               dm.is_keeper, dm.suggested
        FROM duplicate_members dm
        JOIN files f ON f.id = dm.file_id
        WHERE dm.group_id = ?
        ORDER BY dm.suggested DESC, f.width * f.height DESC
    """, (group_id,))
    members = [
        {
            "file_id": row["id"],
            "filename": row["filename"],
            "path": row["path"],
            "file_size": row["file_size"],
            "width": row["width"],
            "height": row["height"],
            "mime_type": row["mime_type"],
            "exif_datetime": row["exif_datetime"],
            "exif_device": row["exif_device"],
            "is_takeout": bool(row["is_takeout"]),
            "is_edited": bool(row["is_edited"]) if row["is_edited"] is not None else False,
            "is_suggested": bool(row["suggested"]),
            "is_keeper": bool(row["is_keeper"]),
            "thumb_url": f"/api/thumb/{row['id']}",
        }
        for row in cur.fetchall()
    ]
    con.close()
    return {
        "id": grp["id"],
        "method": grp["method"],
        "method_color": meta["color"],
        "method_label": meta["label"],
        "method_value": grp["method_value"],
        "reviewed": bool(grp["reviewed"]),
        "members": members,
    }
# ── Decisions ─────────────────────────────────────────────────────────────────
class DecideBody(BaseModel):
    """Request body for POST /api/groups/{group_id}/decide."""
    # Must be a file_id that belongs to the target group.
    keeper_file_id: int
@app.post("/api/groups/{group_id}/decide")
def decide(group_id: int, body: DecideBody):
    """Mark one member as the keeper, all others redundant, and the group reviewed.

    Raises 404 if the group doesn't exist and 400 if keeper_file_id isn't a
    member of the group.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    if not cur.fetchone():
        raise HTTPException(404, "Group not found")
    cur.execute("SELECT file_id FROM duplicate_members WHERE group_id=?", (group_id,))
    member_ids = [r["file_id"] for r in cur.fetchall()]
    if body.keeper_file_id not in member_ids:
        raise HTTPException(400, "keeper_file_id is not a member of this group")
    # Set-based updates instead of one UPDATE pair per member.
    cur.execute(
        "UPDATE duplicate_members SET is_keeper=0 WHERE group_id=?",
        (group_id,),
    )
    cur.execute(
        "UPDATE duplicate_members SET is_keeper=1 WHERE group_id=? AND file_id=?",
        (group_id, body.keeper_file_id),
    )
    cur.execute(
        "UPDATE files SET status='redundant' "
        "WHERE id IN (SELECT file_id FROM duplicate_members WHERE group_id=?) "
        "AND id != ?",
        (group_id, body.keeper_file_id),
    )
    cur.execute("UPDATE files SET status='keeper' WHERE id=?", (body.keeper_file_id,))
    cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}
@app.post("/api/groups/{group_id}/skip")
def skip_group(group_id: int):
    """Mark a group reviewed without choosing a keeper."""
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    found = cur.fetchone()
    if found is None:
        raise HTTPException(404, "Group not found")
    cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}
@app.post("/api/groups/{group_id}/keep-all")
def keep_all(group_id: int):
    """Keep every member of the group and mark it reviewed.

    Raises 404 if the group doesn't exist.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    if not cur.fetchone():
        raise HTTPException(404, "Group not found")
    # Two set-based UPDATEs instead of a Python loop issuing one pair per member.
    cur.execute(
        "UPDATE duplicate_members SET is_keeper=1 WHERE group_id=?",
        (group_id,),
    )
    cur.execute(
        "UPDATE files SET status='keeper' "
        "WHERE id IN (SELECT file_id FROM duplicate_members WHERE group_id=?)",
        (group_id,),
    )
    cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}
@app.post("/api/groups/{group_id}/unreview")
def unreview_group(group_id: int):
    """Undo a review: clear keeper flags, reset member files to 'pending'.

    Raises 404 if the group doesn't exist.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    if not cur.fetchone():
        raise HTTPException(404, "Group not found")
    # Two set-based UPDATEs instead of a Python loop issuing one pair per member.
    cur.execute(
        "UPDATE duplicate_members SET is_keeper=0 WHERE group_id=?",
        (group_id,),
    )
    cur.execute(
        "UPDATE files SET status='pending' "
        "WHERE id IN (SELECT file_id FROM duplicate_members WHERE group_id=?)",
        (group_id,),
    )
    cur.execute("UPDATE duplicate_groups SET reviewed=0 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}
@app.post("/api/groups/auto-resolve-exact")
def auto_resolve_exact():
    """Auto-review every unreviewed sha256 group using the scanner's heuristic.

    Returns the number of groups resolved.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("""
        SELECT id FROM duplicate_groups
        WHERE method='sha256' AND reviewed=0
    """)
    pending_ids = [row["id"] for row in cur.fetchall()]
    resolved = 0
    for group_id in pending_ids:
        cur.execute("""
            SELECT f.id, f.width, f.height, f.file_size, f.exif_datetime
            FROM duplicate_members dm
            JOIN files f ON f.id = dm.file_id
            WHERE dm.group_id = ?
        """, (group_id,))
        members = [dict(row) for row in cur.fetchall()]
        if not members:
            continue
        # Keeper choice lives in the scanner: largest pixels, then file size,
        # then oldest exif_datetime.
        keeper_id = sc._suggested_keeper_by_resolution(members)
        for member in members:
            keep_flag = 1 if member["id"] == keeper_id else 0
            cur.execute(
                "UPDATE duplicate_members SET is_keeper=? WHERE group_id=? AND file_id=?",
                (keep_flag, group_id, member["id"]),
            )
            cur.execute(
                "UPDATE files SET status=? WHERE id=?",
                ("keeper" if keep_flag else "redundant", member["id"]),
            )
        cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
        resolved += 1
    con.commit()
    con.close()
    return {"resolved": resolved}
# ── Files + thumbnails ────────────────────────────────────────────────────────
# Inline SVG served when a video thumbnail can't be extracted: a play-button tile.
VIDEO_PLACEHOLDER_SVG = """<svg xmlns="http://www.w3.org/2000/svg" width="200" height="200"
viewBox="0 0 200 200">
<rect width="200" height="200" fill="#1e1e2e"/>
<polygon points="75,55 75,145 145,100" fill="#9b7de8"/>
</svg>"""
# File extensions treated as video for thumbnailing (frame grab via ffmpeg).
VIDEO_EXT = {".mp4", ".mov", ".avi", ".mkv", ".m4v", ".3gp", ".wmv", ".mts", ".m2ts"}
@app.get("/api/thumb/{file_id}")
def get_thumb(file_id: int):
    """Serve a thumbnail: first video frame via ffmpeg, or the photo itself."""
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT path, mime_type, extension FROM files WHERE id=?", (file_id,))
    row = cur.fetchone()
    con.close()
    if row is None:
        raise HTTPException(404, "File not found")
    path = row["path"]
    ext = (row["extension"] or "").lower()
    if not os.path.isfile(path):
        raise HTTPException(404, "File not on disk")
    if ext not in VIDEO_EXT:
        # Photos are streamed straight from disk.
        mime = row["mime_type"] or "application/octet-stream"
        return FileResponse(path, media_type=mime)
    # Videos: best-effort first-frame grab; fall back to the placeholder tile.
    try:
        proc = subprocess.run(
            [
                "ffmpeg", "-i", path,
                "-vframes", "1", "-f", "image2", "-vcodec", "mjpeg",
                "pipe:1",
            ],
            capture_output=True, timeout=10,
        )
        if proc.returncode == 0 and proc.stdout:
            return Response(content=proc.stdout, media_type="image/jpeg")
    except Exception:
        # ffmpeg missing or timed out — placeholder below.
        pass
    return Response(content=VIDEO_PLACEHOLDER_SVG, media_type="image/svg+xml")
@app.get("/api/files/{file_id}")
def get_file_meta(file_id: int):
    """Return the raw files-table row for one file as JSON."""
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT * FROM files WHERE id=?", (file_id,))
    row = cur.fetchone()
    con.close()
    if row is None:
        raise HTTPException(404, "File not found")
    return dict(row)
# ── Browse & stats ────────────────────────────────────────────────────────────
@app.get("/api/browse")
def browse(path: str = Query("/")):
    """List subdirectories at the given path for the folder picker."""
    try:
        current = Path(path).resolve()
    except Exception:
        raise HTTPException(400, "Invalid path")
    if not (current.exists() and current.is_dir()):
        raise HTTPException(404, "Path not found")
    subdirs = []
    try:
        # Hidden directories (dot-prefixed) are skipped.
        for entry in sorted(current.iterdir()):
            if entry.is_dir() and not entry.name.startswith("."):
                subdirs.append(entry.name)
    except PermissionError:
        # Unreadable directory: report it with no children rather than failing.
        pass
    # Filesystem root is its own parent; signal "no parent" with None.
    parent = None if current == current.parent else str(current.parent)
    return {
        "current": str(current),
        "parent": parent,
        "dirs": subdirs,
    }
@app.get("/api/stats")
def get_stats():
    """Aggregate dashboard numbers: totals, duplicate sizes, per-method counts."""
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT COUNT(*), SUM(file_size) FROM files WHERE status != 'error'")
    row = cur.fetchone()
    # SUM() is NULL on an empty table; coerce to 0.
    total_files = row[0] or 0
    total_size = row[1] or 0
    cur.execute("""
        SELECT COUNT(*), SUM(f.file_size)
        FROM files f
        JOIN duplicate_members dm ON dm.file_id = f.id
        WHERE dm.is_keeper = 0
    """)
    row = cur.fetchone()
    dup_files = row[0] or 0
    dup_size = row[1] or 0
    by_method = {}
    for method in ("sha256", "phash", "exif", "filesize"):
        cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE method=?", (method,))
        group_count = cur.fetchone()[0]
        cur.execute("""
            SELECT COUNT(*) FROM duplicate_members dm
            JOIN duplicate_groups dg ON dg.id = dm.group_id
            WHERE dg.method = ?
        """, (method,))
        file_count = cur.fetchone()[0]
        by_method[method] = {"groups": group_count, "files": file_count}
    cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
    reviewed = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
    pending = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM files WHERE is_takeout=1")
    takeout_files = cur.fetchone()[0]
    con.close()
    return {
        "total_files": total_files,
        "total_size_bytes": total_size,
        "duplicate_files": dup_files,
        "duplicate_size_bytes": dup_size,
        "groups_by_method": by_method,
        "reviewed": reviewed,
        "pending": pending,
        "takeout_files": takeout_files,
    }
# ── Export ────────────────────────────────────────────────────────────────────
@app.get("/api/export/csv")
def export_csv():
    """Export every group membership as a CSV attachment."""
    con = get_db()
    cur = con.cursor()
    # is_redundant is derived: a non-keeper in a reviewed group.
    cur.execute("""
        SELECT dg.id as group_id, dg.method, f.id as file_id,
               f.path, f.filename, f.file_size,
               f.width, f.height, f.exif_datetime, f.exif_device,
               dm.is_keeper,
               CASE WHEN dm.is_keeper=0 AND dg.reviewed=1 THEN 1 ELSE 0 END as is_redundant,
               dg.reviewed
        FROM duplicate_groups dg
        JOIN duplicate_members dm ON dm.group_id = dg.id
        JOIN files f ON f.id = dm.file_id
        ORDER BY dg.id, dm.is_keeper DESC
    """)
    rows = cur.fetchall()
    con.close()
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow([
        "group_id", "method", "file_id", "path", "filename",
        "size", "width", "height", "exif_date", "device",
        "is_keeper", "is_redundant", "reviewed",
    ])
    writer.writerows(
        [
            r["group_id"], r["method"], r["file_id"],
            r["path"], r["filename"], r["file_size"],
            r["width"], r["height"], r["exif_datetime"], r["exif_device"],
            r["is_keeper"], r["is_redundant"], r["reviewed"],
        ]
        for r in rows
    )
    return StreamingResponse(
        iter([buf.getvalue()]),
        media_type="text/csv",
        headers={"Content-Disposition": "attachment; filename=dup-finder-export.csv"},
    )