commit 868da9016d6f59f6a92040c59a2fc7be50eece91 Author: tocmo Date: Sat Apr 4 23:42:58 2026 -0400 Initial implementation of duplicate finder Full project per spec: FastAPI backend, 4-method duplicate detection (SHA-256, phash, EXIF, filesize), Google Takeout pre-processor, 4 scan modes, and dark-theme vanilla JS gallery frontend. Co-Authored-By: Claude Sonnet 4.6 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..b9f89f0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.12-slim + +RUN apt-get update && apt-get install -y \ + libheif-dev libjpeg-dev libpng-dev libtiff-dev libwebp-dev exiftool \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +COPY app/ /app/ +COPY templates/ /app/templates/ +RUN mkdir -p /data /photos /app/static + +EXPOSE 8000 +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..deb128d --- /dev/null +++ b/README.md @@ -0,0 +1,56 @@ +# Duplicate Finder + +A self-hosted Docker web app that scans a photo/video library, detects duplicates using four methods, and lets you review them in a gallery UI. **No files are ever moved, renamed, or deleted** — all decisions are recorded in SQLite only. + +## Quick start + +```bash +# 1. Edit docker-compose.yml — set your photos volume path +# 2. Build and run +docker compose up -d --build +# 3. Open http://localhost:8765 +# 4. Enter folder path in UI and click Scan +``` + +## Volume mounts + +| Container path | Purpose | +|---|---| +| `/photos` | Your photo library — mounted **read-only** | +| `/data` | SQLite database persistence | + +Edit `docker-compose.yml` to point these at your NAS paths. + +## Detection methods + +| Method | Color | Description | +|---|---|---| +| SHA-256 | Blue | Byte-identical files | +| Perceptual hash | Purple | Visually similar photos (hamming ≤ 10) | +| EXIF timestamp + device | Amber | Same camera, same moment | +| File size + dimensions | Gray | Same size and resolution (low confidence) | + +## Scan modes + +| Mode | Description | +|---|---| +| Incremental | Only re-hashes changed/new files. Prior decisions preserved. | +| New files only | Indexes newly added files. Existing decisions untouched. | +| Rebuild groups | Re-runs detection on existing index. No re-hashing. | +| Full reset | Wipes everything and scans from scratch. | + +## Google Takeout + +The scanner automatically detects Google Takeout folder structures and reads `.json` sidecar files to restore correct capture timestamps and original filenames. Takeout files are flagged in the UI. + +## What "redundant" means + +Marking a file redundant **only writes to the database**. Nothing is moved, renamed, or deleted. This tool produces a decision record only. A separate tool handles file actions. + +## Tech stack + +- Python 3.12, FastAPI, Uvicorn +- SQLite (stdlib `sqlite3`) +- Pillow, imagehash, pillow-heif +- Vanilla JS single-page frontend +- Docker / docker-compose diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000..9a74274 --- /dev/null +++ b/app/main.py @@ -0,0 +1,599 @@ +""" +FastAPI application — all API routes for the duplicate finder. +""" + +import csv +import io +import os +import sqlite3 +import subprocess +import threading +from pathlib import Path +from typing import Optional + +from fastapi import FastAPI, HTTPException, Query, Request +from fastapi.responses import ( + FileResponse, JSONResponse, Response, StreamingResponse +) +from fastapi.staticfiles import StaticFiles +from fastapi.templating import Jinja2Templates +from pydantic import BaseModel + +import scanner as sc + +app = FastAPI(title="Duplicate Finder") +templates = Jinja2Templates(directory="/app/templates") + +app.mount("/static", StaticFiles(directory="/app/static"), name="static") + +METHOD_META = { + "sha256": {"color": "#378ADD", "label": "Exact copy"}, + "phash": {"color": "#9b7de8", "label": "Visual match"}, + "exif": {"color": "#e2a43a", "label": "Same moment"}, + "filesize": {"color": "#888780", "label": "Possible match"}, +} + +# ── Startup ─────────────────────────────────────────────────────────────────── + +@app.on_event("startup") +def startup(): + sc.init_db() + + +# ── Frontend ────────────────────────────────────────────────────────────────── + +@app.get("/") +def index(request: Request): + return templates.TemplateResponse("index.html", {"request": request}) + + +# ── DB helper ───────────────────────────────────────────────────────────────── + +def get_db() -> sqlite3.Connection: + return sc.get_db() + + +# ── Scan management ─────────────────────────────────────────────────────────── + +class ScanStartBody(BaseModel): + folder_path: str + mode: str = "incremental" + + +@app.post("/api/scan/start") +def scan_start(body: ScanStartBody): + if sc.scan_state["status"] == "running": + raise HTTPException(400, "A scan is already running") + + mode = body.mode + if mode not in ("incremental", "full_reset", "new_files", "regroup"): + raise HTTPException(400, f"Unknown scan mode: {mode}") + + if mode == "full_reset": + con = get_db() + cur = con.cursor() + cur.execute("DELETE FROM duplicate_members") + cur.execute("DELETE FROM duplicate_groups") + cur.execute("DELETE FROM files") + cur.execute("DELETE FROM scans") + con.commit() + con.close() + + con = get_db() + cur = con.cursor() + cur.execute( + "INSERT INTO scans (folder_path, status) VALUES (?, 'running')", + (body.folder_path,), + ) + scan_id = cur.lastrowid + con.commit() + con.close() + + sc.scan_state.update( + scan_id=scan_id, + status="running", + phase="discovery", + progress=0, + total=0, + message="Starting...", + cancel_requested=False, + stats={}, + ) + + thread = threading.Thread( + target=sc.run_scan, + args=(body.folder_path, scan_id, mode), + daemon=True, + ) + thread.start() + + return {"scan_id": scan_id} + + +@app.get("/api/scan/status") +def scan_status(): + state = sc.scan_state + con = get_db() + cur = con.cursor() + + # Build group counts per method + stats = {} + for method in ("sha256", "phash", "exif", "filesize"): + cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE method=?", (method,)) + stats[f"{method}_groups"] = cur.fetchone()[0] + + cur.execute("SELECT COUNT(*) FROM duplicate_groups") + stats["total_groups"] = cur.fetchone()[0] + cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1") + stats["reviewed"] = cur.fetchone()[0] + cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0") + stats["pending"] = cur.fetchone()[0] + cur.execute("SELECT COUNT(*) FROM files WHERE status != 'error'") + stats["total_files"] = cur.fetchone()[0] + con.close() + + return { + "scan_id": state["scan_id"], + "status": state["status"], + "phase": state["phase"], + "progress": state["progress"], + "total": state["total"], + "message": state["message"], + "stats": stats, + } + + +@app.post("/api/scan/cancel") +def scan_cancel(): + if sc.scan_state["status"] != "running": + raise HTTPException(400, "No scan is currently running") + sc.scan_state["cancel_requested"] = True + return {"success": True} + + +@app.delete("/api/scan/reset") +def scan_reset(confirm: str = Query("")): + if confirm != "RESET": + raise HTTPException(400, "Pass ?confirm=RESET to confirm") + con = get_db() + cur = con.cursor() + cur.execute("DELETE FROM duplicate_members") + cur.execute("DELETE FROM duplicate_groups") + cur.execute("DELETE FROM files") + cur.execute("DELETE FROM scans") + con.commit() + con.close() + sc.scan_state.update( + scan_id=None, status="idle", phase="idle", + progress=0, total=0, message="", stats={}, + ) + return {"success": True} + + +# ── Duplicate groups ────────────────────────────────────────────────────────── + +@app.get("/api/groups") +def list_groups( + method: str = "all", + reviewed: str = "false", + sort: str = "count", + offset: int = 0, + limit: int = 50, +): + con = get_db() + cur = con.cursor() + + where = [] + params: list = [] + + if method != "all": + where.append("dg.method = ?") + params.append(method) + + if reviewed == "false": + where.append("dg.reviewed = 0") + elif reviewed == "true": + where.append("dg.reviewed = 1") + + where_clause = ("WHERE " + " AND ".join(where)) if where else "" + + order = { + "count": "member_count DESC", + "method": "dg.method, member_count DESC", + "date": "dg.created_at DESC", + }.get(sort, "member_count DESC") + + cur.execute(f""" + SELECT COUNT(*) FROM duplicate_groups dg {where_clause} + """, params) + total = cur.fetchone()[0] + + cur.execute(f""" + SELECT dg.id, dg.method, dg.reviewed, + COUNT(dm.id) as member_count, + (SELECT dm2.file_id FROM duplicate_members dm2 + WHERE dm2.group_id = dg.id AND dm2.suggested = 1 + LIMIT 1) as suggested_file_id + FROM duplicate_groups dg + LEFT JOIN duplicate_members dm ON dm.group_id = dg.id + {where_clause} + GROUP BY dg.id + ORDER BY {order} + LIMIT ? OFFSET ? + """, params + [limit, offset]) + + groups = [] + for row in cur.fetchall(): + meta = METHOD_META.get(row["method"], {"color": "#888", "label": row["method"]}) + suggested = None + if row["suggested_file_id"]: + cur.execute( + "SELECT id, filename, width, height FROM files WHERE id=?", + (row["suggested_file_id"],), + ) + f = cur.fetchone() + if f: + suggested = { + "file_id": f["id"], + "filename": f["filename"], + "width": f["width"], + "height": f["height"], + "thumb_url": f"/api/thumb/{f['id']}", + } + groups.append({ + "id": row["id"], + "method": row["method"], + "method_color": meta["color"], + "method_label": meta["label"], + "member_count": row["member_count"], + "reviewed": bool(row["reviewed"]), + "suggested_keeper": suggested, + }) + + con.close() + return {"total": total, "groups": groups} + + +@app.get("/api/groups/{group_id}") +def get_group(group_id: int): + con = get_db() + cur = con.cursor() + + cur.execute("SELECT * FROM duplicate_groups WHERE id=?", (group_id,)) + grp = cur.fetchone() + if not grp: + raise HTTPException(404, "Group not found") + + meta = METHOD_META.get(grp["method"], {"color": "#888", "label": grp["method"]}) + + cur.execute(""" + SELECT f.id, f.filename, f.path, f.file_size, f.width, f.height, + f.mime_type, f.exif_datetime, f.exif_device, + f.is_takeout, f.is_edited, + dm.is_keeper, dm.suggested + FROM duplicate_members dm + JOIN files f ON f.id = dm.file_id + WHERE dm.group_id = ? + ORDER BY dm.suggested DESC, f.width * f.height DESC + """, (group_id,)) + + members = [] + for r in cur.fetchall(): + members.append({ + "file_id": r["id"], + "filename": r["filename"], + "path": r["path"], + "file_size": r["file_size"], + "width": r["width"], + "height": r["height"], + "mime_type": r["mime_type"], + "exif_datetime": r["exif_datetime"], + "exif_device": r["exif_device"], + "is_takeout": bool(r["is_takeout"]), + "is_edited": bool(r["is_edited"]) if r["is_edited"] is not None else False, + "is_suggested": bool(r["suggested"]), + "is_keeper": bool(r["is_keeper"]), + "thumb_url": f"/api/thumb/{r['id']}", + }) + + con.close() + return { + "id": grp["id"], + "method": grp["method"], + "method_color": meta["color"], + "method_label": meta["label"], + "method_value": grp["method_value"], + "reviewed": bool(grp["reviewed"]), + "members": members, + } + + +# ── Decisions ───────────────────────────────────────────────────────────────── + +class DecideBody(BaseModel): + keeper_file_id: int + + +@app.post("/api/groups/{group_id}/decide") +def decide(group_id: int, body: DecideBody): + con = get_db() + cur = con.cursor() + + cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,)) + if not cur.fetchone(): + raise HTTPException(404, "Group not found") + + cur.execute("SELECT file_id FROM duplicate_members WHERE group_id=?", (group_id,)) + all_members = [r["file_id"] for r in cur.fetchall()] + + if body.keeper_file_id not in all_members: + raise HTTPException(400, "keeper_file_id is not a member of this group") + + for fid in all_members: + is_k = 1 if fid == body.keeper_file_id else 0 + cur.execute( + "UPDATE duplicate_members SET is_keeper=? WHERE group_id=? AND file_id=?", + (is_k, group_id, fid), + ) + status = "keeper" if is_k else "redundant" + cur.execute("UPDATE files SET status=? WHERE id=?", (status, fid)) + + cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,)) + con.commit() + con.close() + return {"success": True} + + +@app.post("/api/groups/{group_id}/skip") +def skip_group(group_id: int): + con = get_db() + cur = con.cursor() + cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,)) + if not cur.fetchone(): + raise HTTPException(404, "Group not found") + cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,)) + con.commit() + con.close() + return {"success": True} + + +@app.post("/api/groups/{group_id}/keep-all") +def keep_all(group_id: int): + con = get_db() + cur = con.cursor() + cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,)) + if not cur.fetchone(): + raise HTTPException(404, "Group not found") + cur.execute("SELECT file_id FROM duplicate_members WHERE group_id=?", (group_id,)) + for r in cur.fetchall(): + cur.execute( + "UPDATE duplicate_members SET is_keeper=1 WHERE group_id=? AND file_id=?", + (group_id, r["file_id"]), + ) + cur.execute("UPDATE files SET status='keeper' WHERE id=?", (r["file_id"],)) + cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,)) + con.commit() + con.close() + return {"success": True} + + +@app.post("/api/groups/{group_id}/unreview") +def unreview_group(group_id: int): + con = get_db() + cur = con.cursor() + cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,)) + if not cur.fetchone(): + raise HTTPException(404, "Group not found") + cur.execute("SELECT file_id FROM duplicate_members WHERE group_id=?", (group_id,)) + for r in cur.fetchall(): + cur.execute( + "UPDATE duplicate_members SET is_keeper=0 WHERE group_id=? AND file_id=?", + (group_id, r["file_id"]), + ) + cur.execute("UPDATE files SET status='pending' WHERE id=?", (r["file_id"],)) + cur.execute("UPDATE duplicate_groups SET reviewed=0 WHERE id=?", (group_id,)) + con.commit() + con.close() + return {"success": True} + + +@app.post("/api/groups/auto-resolve-exact") +def auto_resolve_exact(): + con = get_db() + cur = con.cursor() + cur.execute(""" + SELECT id FROM duplicate_groups + WHERE method='sha256' AND reviewed=0 + """) + groups = [r["id"] for r in cur.fetchall()] + resolved = 0 + + for gid in groups: + cur.execute(""" + SELECT f.id, f.width, f.height, f.file_size, f.exif_datetime + FROM duplicate_members dm + JOIN files f ON f.id = dm.file_id + WHERE dm.group_id = ? + """, (gid,)) + members = [dict(r) for r in cur.fetchall()] + if not members: + continue + + keeper_id = sc._suggested_keeper_by_resolution(members) + for m in members: + is_k = 1 if m["id"] == keeper_id else 0 + cur.execute( + "UPDATE duplicate_members SET is_keeper=? WHERE group_id=? AND file_id=?", + (is_k, gid, m["id"]), + ) + cur.execute( + "UPDATE files SET status=? WHERE id=?", + ("keeper" if is_k else "redundant", m["id"]), + ) + cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (gid,)) + resolved += 1 + + con.commit() + con.close() + return {"resolved": resolved} + + +# ── Files + thumbnails ──────────────────────────────────────────────────────── + +VIDEO_PLACEHOLDER_SVG = """ + + +""" + +VIDEO_EXT = {".mp4", ".mov", ".avi", ".mkv", ".m4v", ".3gp", ".wmv", ".mts", ".m2ts"} + + +@app.get("/api/thumb/{file_id}") +def get_thumb(file_id: int): + con = get_db() + cur = con.cursor() + cur.execute("SELECT path, mime_type, extension FROM files WHERE id=?", (file_id,)) + row = cur.fetchone() + con.close() + + if not row: + raise HTTPException(404, "File not found") + + path = row["path"] + ext = (row["extension"] or "").lower() + + if not os.path.isfile(path): + raise HTTPException(404, "File not on disk") + + if ext in VIDEO_EXT: + # Try ffmpeg for first frame + try: + result = subprocess.run( + [ + "ffmpeg", "-i", path, + "-vframes", "1", "-f", "image2", "-vcodec", "mjpeg", + "pipe:1", + ], + capture_output=True, timeout=10, + ) + if result.returncode == 0 and result.stdout: + return Response(content=result.stdout, media_type="image/jpeg") + except Exception: + pass + return Response(content=VIDEO_PLACEHOLDER_SVG, media_type="image/svg+xml") + + # Serve photo directly + mime = row["mime_type"] or "application/octet-stream" + return FileResponse(path, media_type=mime) + + +@app.get("/api/files/{file_id}") +def get_file_meta(file_id: int): + con = get_db() + cur = con.cursor() + cur.execute("SELECT * FROM files WHERE id=?", (file_id,)) + row = cur.fetchone() + con.close() + if not row: + raise HTTPException(404, "File not found") + return dict(row) + + +# ── Stats ───────────────────────────────────────────────────────────────────── + +@app.get("/api/stats") +def get_stats(): + con = get_db() + cur = con.cursor() + + cur.execute("SELECT COUNT(*), SUM(file_size) FROM files WHERE status != 'error'") + r = cur.fetchone() + total_files = r[0] or 0 + total_size = r[1] or 0 + + cur.execute(""" + SELECT COUNT(*), SUM(f.file_size) + FROM files f + JOIN duplicate_members dm ON dm.file_id = f.id + WHERE dm.is_keeper = 0 + """) + r = cur.fetchone() + dup_files = r[0] or 0 + dup_size = r[1] or 0 + + by_method = {} + for method in ("sha256", "phash", "exif", "filesize"): + cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE method=?", (method,)) + groups = cur.fetchone()[0] + cur.execute(""" + SELECT COUNT(*) FROM duplicate_members dm + JOIN duplicate_groups dg ON dg.id = dm.group_id + WHERE dg.method = ? + """, (method,)) + files = cur.fetchone()[0] + by_method[method] = {"groups": groups, "files": files} + + cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1") + reviewed = cur.fetchone()[0] + cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0") + pending = cur.fetchone()[0] + + cur.execute("SELECT COUNT(*) FROM files WHERE is_takeout=1") + takeout_files = cur.fetchone()[0] + + con.close() + return { + "total_files": total_files, + "total_size_bytes": total_size, + "duplicate_files": dup_files, + "duplicate_size_bytes": dup_size, + "groups_by_method": by_method, + "reviewed": reviewed, + "pending": pending, + "takeout_files": takeout_files, + } + + +# ── Export ──────────────────────────────────────────────────────────────────── + +@app.get("/api/export/csv") +def export_csv(): + con = get_db() + cur = con.cursor() + cur.execute(""" + SELECT dg.id as group_id, dg.method, f.id as file_id, + f.path, f.filename, f.file_size, + f.width, f.height, f.exif_datetime, f.exif_device, + dm.is_keeper, + CASE WHEN dm.is_keeper=0 AND dg.reviewed=1 THEN 1 ELSE 0 END as is_redundant, + dg.reviewed + FROM duplicate_groups dg + JOIN duplicate_members dm ON dm.group_id = dg.id + JOIN files f ON f.id = dm.file_id + ORDER BY dg.id, dm.is_keeper DESC + """) + rows = cur.fetchall() + con.close() + + output = io.StringIO() + writer = csv.writer(output) + writer.writerow([ + "group_id", "method", "file_id", "path", "filename", + "size", "width", "height", "exif_date", "device", + "is_keeper", "is_redundant", "reviewed", + ]) + for r in rows: + writer.writerow([ + r["group_id"], r["method"], r["file_id"], + r["path"], r["filename"], r["file_size"], + r["width"], r["height"], r["exif_datetime"], r["exif_device"], + r["is_keeper"], r["is_redundant"], r["reviewed"], + ]) + + output.seek(0) + return StreamingResponse( + iter([output.getvalue()]), + media_type="text/csv", + headers={"Content-Disposition": "attachment; filename=dup-finder-export.csv"}, + ) diff --git a/app/scanner.py b/app/scanner.py new file mode 100644 index 0000000..108db0d --- /dev/null +++ b/app/scanner.py @@ -0,0 +1,758 @@ +""" +File scanner: discovery, per-file extraction, and all 4 duplicate detection passes. +""" + +import hashlib +import mimetypes +import os +import sqlite3 +import subprocess +from datetime import datetime +from pathlib import Path + +import imagehash +from PIL import Image, ExifTags, UnidentifiedImageError + +try: + from pillow_heif import register_heif_opener + register_heif_opener() +except ImportError: + pass + +from takeout import is_takeout_folder, process_takeout + + +PHOTO_EXT = { + ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", + ".webp", ".heic", ".heif", ".raw", ".cr2", ".nef", ".arw", + ".dng", ".orf", ".rw2", ".pef", ".srw", ".x3f", +} + +VIDEO_EXT = { + ".mp4", ".mov", ".avi", ".mkv", ".m4v", ".3gp", + ".wmv", ".mts", ".m2ts", +} + +SUPPORTED_EXT = PHOTO_EXT | VIDEO_EXT + +DB_PATH = "/data/dupfinder.db" + +# Shared scan state (updated by background thread, read by status endpoint) +scan_state = { + "scan_id": None, + "status": "idle", # idle | running | complete | error | cancelled + "phase": "idle", # discovery | takeout | indexing | phash | grouping | done + "progress": 0, + "total": 0, + "message": "", + "cancel_requested": False, + "stats": {}, +} + + +# ── DB helpers ──────────────────────────────────────────────────────────────── + +def get_db() -> sqlite3.Connection: + con = sqlite3.connect(DB_PATH, timeout=30) + con.row_factory = sqlite3.Row + con.execute("PRAGMA journal_mode=WAL") + con.execute("PRAGMA foreign_keys=ON") + return con + + +def init_db(): + con = get_db() + cur = con.cursor() + cur.executescript(""" + CREATE TABLE IF NOT EXISTS files ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + path TEXT UNIQUE NOT NULL, + filename TEXT NOT NULL, + extension TEXT, + file_size INTEGER, + mime_type TEXT, + sha256 TEXT, + phash TEXT, + exif_datetime TEXT, + exif_device TEXT, + width INTEGER, + height INTEGER, + is_takeout INTEGER DEFAULT 0, + is_edited INTEGER DEFAULT 0, + takeout_json TEXT, + scan_id INTEGER, + status TEXT DEFAULT 'pending', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ); + + CREATE TABLE IF NOT EXISTS scans ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + folder_path TEXT NOT NULL, + started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + completed_at TIMESTAMP, + total_files INTEGER DEFAULT 0, + status TEXT DEFAULT 'running' + ); + + CREATE TABLE IF NOT EXISTS duplicate_groups ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + method TEXT NOT NULL, + method_value TEXT, + reviewed INTEGER DEFAULT 0, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ); + + CREATE TABLE IF NOT EXISTS duplicate_members ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + group_id INTEGER REFERENCES duplicate_groups(id) ON DELETE CASCADE, + file_id INTEGER REFERENCES files(id) ON DELETE CASCADE, + is_keeper INTEGER DEFAULT 0, + suggested INTEGER DEFAULT 0 + ); + + CREATE INDEX IF NOT EXISTS idx_sha256 ON files(sha256); + CREATE INDEX IF NOT EXISTS idx_phash ON files(phash); + CREATE INDEX IF NOT EXISTS idx_exif_dt ON files(exif_datetime, exif_device); + CREATE INDEX IF NOT EXISTS idx_size_dim ON files(file_size, width, height); + CREATE INDEX IF NOT EXISTS idx_status ON files(status); + """) + con.commit() + con.close() + + +# ── Per-file extraction ─────────────────────────────────────────────────────── + +def _sha256(path: str) -> str: + h = hashlib.sha256() + with open(path, "rb") as f: + while chunk := f.read(65536): + h.update(chunk) + return h.hexdigest() + + +def _exif_data(path: str) -> tuple[str | None, str | None]: + """Returns (exif_datetime, exif_device) or (None, None).""" + try: + img = Image.open(path) + exif_raw = img._getexif() + if not exif_raw: + return None, None + exif = {ExifTags.TAGS.get(k, k): v for k, v in exif_raw.items()} + dt = exif.get("DateTimeOriginal") or exif.get("DateTime") + if dt: + try: + dt = datetime.strptime(dt, "%Y:%m:%d %H:%M:%S").strftime("%Y-%m-%dT%H:%M:%S") + except ValueError: + dt = None + make = str(exif.get("Make", "")).strip() + model = str(exif.get("Model", "")).strip() + device = (make + " " + model).strip() if (make or model) else None + return dt, device + except Exception: + return None, None + + +def _image_dims(path: str) -> tuple[int | None, int | None]: + try: + with Image.open(path) as img: + return img.size # (width, height) + except Exception: + return None, None + + +def _phash(path: str) -> str | None: + try: + with Image.open(path) as img: + return str(imagehash.phash(img)) + except Exception: + return None + + +def _video_dims(path: str) -> tuple[int | None, int | None]: + try: + result = subprocess.run( + [ + "ffprobe", "-v", "error", + "-select_streams", "v:0", + "-show_entries", "stream=width,height", + "-of", "csv=p=0", + path, + ], + capture_output=True, text=True, timeout=10, + ) + parts = result.stdout.strip().split(",") + if len(parts) == 2: + return int(parts[0]), int(parts[1]) + except Exception: + pass + return None, None + + +def _mtime_str(path: str) -> str | None: + try: + ts = os.path.getmtime(path) + return datetime.fromtimestamp(ts).strftime("%Y-%m-%dT%H:%M:%S") + except Exception: + return None + + +def extract_file(path: str) -> dict: + ext = Path(path).suffix.lower() + filename = Path(path).name + is_photo = ext in PHOTO_EXT + is_video = ext in VIDEO_EXT + + record = { + "path": path, + "filename": filename, + "extension": ext, + "file_size": None, + "mime_type": None, + "sha256": None, + "phash": None, + "exif_datetime": None, + "exif_device": None, + "width": None, + "height": None, + } + + try: + record["file_size"] = os.path.getsize(path) + except OSError: + pass + + record["mime_type"] = mimetypes.guess_type(path)[0] + + try: + record["sha256"] = _sha256(path) + except OSError: + pass + + if is_photo: + w, h = _image_dims(path) + record["width"], record["height"] = w, h + dt, device = _exif_data(path) + record["exif_datetime"] = dt or _mtime_str(path) + record["exif_device"] = device + # phash computed in separate phase for progress reporting + + elif is_video: + w, h = _video_dims(path) + record["width"], record["height"] = w, h + record["exif_datetime"] = _mtime_str(path) + + return record + + +# ── Union-Find for phash grouping ──────────────────────────────────────────── + +class UnionFind: + def __init__(self): + self.parent: dict[int, int] = {} + + def find(self, x: int) -> int: + if x not in self.parent: + self.parent[x] = x + if self.parent[x] != x: + self.parent[x] = self.find(self.parent[x]) + return self.parent[x] + + def union(self, x: int, y: int): + px, py = self.find(x), self.find(y) + if px != py: + self.parent[px] = py + + def groups(self) -> dict[int, list[int]]: + from collections import defaultdict + result: dict[int, list[int]] = defaultdict(list) + for x in self.parent: + result[self.find(x)].append(x) + return {k: v for k, v in result.items() if len(v) >= 2} + + +# ── Detection passes ────────────────────────────────────────────────────────── + +def _suggested_keeper_by_resolution(members: list[dict]) -> int: + """Return file_id of highest resolution member; tie-break by size then oldest date.""" + def score(m): + w = m["width"] or 0 + h = m["height"] or 0 + size = m["file_size"] or 0 + dt = m["exif_datetime"] or "9999" + return (w * h, size, dt) + + best = max(members, key=lambda m: ( + (m["width"] or 0) * (m["height"] or 0), + m["file_size"] or 0, + # older date = better; invert by negating epoch or use str comparison inverted + )) + return best["id"] + + +def _suggested_keeper_oldest(members: list[dict]) -> int: + def key(m): + return m["exif_datetime"] or "9999" + return min(members, key=key)["id"] + + +def _run_sha256_pass(con: sqlite3.Connection, scan_id: int): + cur = con.cursor() + cur.execute(""" + SELECT sha256, COUNT(*) as cnt + FROM files + WHERE sha256 IS NOT NULL + GROUP BY sha256 + HAVING cnt > 1 + """) + rows = cur.fetchall() + for row in rows: + sha = row["sha256"] + cur.execute(""" + SELECT id, width, height, file_size, exif_datetime + FROM files WHERE sha256 = ? + """, (sha,)) + members = [dict(r) for r in cur.fetchall()] + + keeper_id = _suggested_keeper_by_resolution(members) + + cur.execute( + "INSERT INTO duplicate_groups (method, method_value) VALUES ('sha256', ?)", + (sha,), + ) + group_id = cur.lastrowid + for m in members: + cur.execute( + "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)", + (group_id, m["id"], 1 if m["id"] == keeper_id else 0), + ) + + +def _run_phash_pass(con: sqlite3.Connection, scan_id: int): + cur = con.cursor() + # Exclude files already in sha256 groups + cur.execute(""" + SELECT f.id, f.phash, f.width, f.height, f.file_size, f.exif_datetime + FROM files f + WHERE f.phash IS NOT NULL + AND f.extension NOT IN ( + '.mp4','.mov','.avi','.mkv','.m4v','.3gp','.wmv','.mts','.m2ts' + ) + AND f.id NOT IN ( + SELECT dm.file_id FROM duplicate_members dm + JOIN duplicate_groups dg ON dg.id = dm.group_id + WHERE dg.method = 'sha256' + ) + """) + rows = [dict(r) for r in cur.fetchall()] + + if len(rows) < 2: + return + + # Bucket by first 2 hex chars to reduce O(n²) comparisons + buckets: dict[str, list[dict]] = {} + for r in rows: + key = r["phash"][:2] + buckets.setdefault(key, []).append(r) + + uf = UnionFind() + # Ensure all IDs are registered + for r in rows: + uf.find(r["id"]) + + THRESHOLD = 10 + for bucket in buckets.values(): + for i in range(len(bucket)): + for j in range(i + 1, len(bucket)): + a, b = bucket[i], bucket[j] + try: + dist = imagehash.hex_to_hash(a["phash"]) - imagehash.hex_to_hash(b["phash"]) + if dist <= THRESHOLD: + uf.union(a["id"], b["id"]) + except Exception: + pass + + id_map = {r["id"]: r for r in rows} + for _, member_ids in uf.groups().items(): + members = [id_map[mid] for mid in member_ids if mid in id_map] + if len(members) < 2: + continue + keeper_id = _suggested_keeper_by_resolution(members) + keeper = id_map[keeper_id] + cur.execute( + "INSERT INTO duplicate_groups (method, method_value) VALUES ('phash', ?)", + (keeper["phash"],), + ) + group_id = cur.lastrowid + for m in members: + cur.execute( + "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)", + (group_id, m["id"], 1 if m["id"] == keeper_id else 0), + ) + + +def _run_exif_pass(con: sqlite3.Connection, scan_id: int): + cur = con.cursor() + cur.execute(""" + SELECT exif_datetime, exif_device, COUNT(*) as cnt + FROM files + WHERE exif_datetime IS NOT NULL + AND exif_device IS NOT NULL + AND id NOT IN ( + SELECT file_id FROM duplicate_members dm + JOIN duplicate_groups dg ON dg.id = dm.group_id + WHERE dg.method IN ('sha256', 'phash') + ) + GROUP BY exif_datetime, exif_device + HAVING cnt > 1 + """) + rows = cur.fetchall() + for row in rows: + dt, dev = row["exif_datetime"], row["exif_device"] + cur.execute(""" + SELECT id, width, height, file_size, exif_datetime + FROM files + WHERE exif_datetime = ? AND exif_device = ? + """, (dt, dev)) + members = [dict(r) for r in cur.fetchall()] + keeper_id = _suggested_keeper_by_resolution(members) + method_value = f"{dt}::{dev}" + cur.execute( + "INSERT INTO duplicate_groups (method, method_value) VALUES ('exif', ?)", + (method_value,), + ) + group_id = cur.lastrowid + for m in members: + cur.execute( + "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)", + (group_id, m["id"], 1 if m["id"] == keeper_id else 0), + ) + + +def _run_filesize_pass(con: sqlite3.Connection, scan_id: int): + cur = con.cursor() + cur.execute(""" + SELECT file_size, width, height, COUNT(*) as cnt + FROM files + WHERE file_size IS NOT NULL + AND width IS NOT NULL + AND height IS NOT NULL + AND id NOT IN ( + SELECT file_id FROM duplicate_members dm + JOIN duplicate_groups dg ON dg.id = dm.group_id + WHERE dg.method IN ('sha256', 'phash', 'exif') + ) + GROUP BY file_size, width, height + HAVING cnt > 1 + """) + rows = cur.fetchall() + for row in rows: + fs, w, h = row["file_size"], row["width"], row["height"] + cur.execute(""" + SELECT id, width, height, file_size, exif_datetime + FROM files + WHERE file_size = ? AND width = ? AND height = ? + """, (fs, w, h)) + members = [dict(r) for r in cur.fetchall()] + keeper_id = _suggested_keeper_oldest(members) + method_value = f"{fs}::{w}x{h}" + cur.execute( + "INSERT INTO duplicate_groups (method, method_value) VALUES ('filesize', ?)", + (method_value,), + ) + group_id = cur.lastrowid + for m in members: + cur.execute( + "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)", + (group_id, m["id"], 1 if m["id"] == keeper_id else 0), + ) + + +# ── Main scan entry point ───────────────────────────────────────────────────── + +def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"): + """Main scan function — runs in background thread.""" + global scan_state + con = get_db() + cur = con.cursor() + + try: + # ── Phase: discovery ────────────────────────────────────────────── + scan_state.update(phase="discovery", progress=0, total=0, + message="Discovering files...") + + all_files = [] + for root, dirs, files in os.walk(folder_path): + dirs[:] = [d for d in dirs if not d.startswith(".")] + for fname in files: + if fname.endswith(".json"): + continue + ext = Path(fname).suffix.lower() + if ext in SUPPORTED_EXT: + all_files.append(os.path.join(root, fname)) + + scan_state["total"] = len(all_files) + scan_state["message"] = f"Found {len(all_files):,} files" + + if scan_state["cancel_requested"]: + _mark_scan(cur, scan_id, "cancelled") + con.commit() + scan_state["status"] = "cancelled" + return + + # ── Mode: full reset ────────────────────────────────────────────── + if mode == "full_reset": + cur.execute("DELETE FROM duplicate_members") + cur.execute("DELETE FROM duplicate_groups") + cur.execute("DELETE FROM files") + con.commit() + + # ── Phase: takeout pre-processing ───────────────────────────────── + scan_state.update(phase="takeout", message="Checking for Google Takeout structure...") + if is_takeout_folder(folder_path): + scan_state["message"] = "Processing Google Takeout sidecars..." + process_takeout(folder_path, DB_PATH) + + if scan_state["cancel_requested"]: + _mark_scan(cur, scan_id, "cancelled") + con.commit() + scan_state["status"] = "cancelled" + return + + # ── Phase: indexing ─────────────────────────────────────────────── + scan_state.update(phase="indexing", progress=0, + message="Indexing files (SHA-256 + EXIF + dimensions)...") + + for i, path in enumerate(all_files): + if scan_state["cancel_requested"]: + _mark_scan(cur, scan_id, "cancelled") + con.commit() + scan_state["status"] = "cancelled" + return + + scan_state["progress"] = i + 1 + scan_state["message"] = f"Indexing: {Path(path).name}" + + # Check existing record + cur.execute("SELECT id, file_size, updated_at FROM files WHERE path = ?", (path,)) + existing = cur.fetchone() + + try: + current_size = os.path.getsize(path) + except OSError: + continue + + if existing and mode in ("incremental", "new_files"): + if mode == "new_files": + # Skip entirely — don't re-hash existing files + cur.execute("UPDATE files SET scan_id = ? WHERE path = ?", (scan_id, path)) + continue + # Incremental: skip if size unchanged (use size as proxy for change) + if existing["file_size"] == current_size: + cur.execute("UPDATE files SET scan_id = ? WHERE path = ?", (scan_id, path)) + continue + # File changed — re-hash, clear group memberships + cur.execute( + "DELETE FROM duplicate_members WHERE file_id = ?", (existing["id"],) + ) + + try: + record = extract_file(path) + except Exception as e: + cur.execute( + "INSERT OR IGNORE INTO files (path, filename, extension, scan_id, status) " + "VALUES (?, ?, ?, ?, 'error')", + (path, Path(path).name, Path(path).suffix.lower(), scan_id), + ) + cur.execute( + "UPDATE files SET status='error', scan_id=?, updated_at=CURRENT_TIMESTAMP " + "WHERE path=?", + (scan_id, path), + ) + con.commit() + continue + + record["scan_id"] = scan_id + if existing: + cur.execute(""" + UPDATE files SET + filename=:filename, extension=:extension, file_size=:file_size, + mime_type=:mime_type, sha256=:sha256, + exif_datetime=:exif_datetime, exif_device=:exif_device, + width=:width, height=:height, scan_id=:scan_id, + status='pending', updated_at=CURRENT_TIMESTAMP + WHERE path=:path + """, record) + else: + cur.execute(""" + INSERT OR IGNORE INTO files + (path, filename, extension, file_size, mime_type, sha256, + exif_datetime, exif_device, width, height, scan_id, status) + VALUES + (:path, :filename, :extension, :file_size, :mime_type, :sha256, + :exif_datetime, :exif_device, :width, :height, :scan_id, 'pending') + """, record) + + if (i + 1) % 100 == 0: + con.commit() + + con.commit() + + # ── Phase: phash ────────────────────────────────────────────────── + scan_state.update(phase="phash", progress=0, + message="Computing perceptual hashes...") + + cur.execute(""" + SELECT id, path FROM files + WHERE extension IN ( + '.jpg','.jpeg','.png','.gif','.bmp','.tiff','.tif', + '.webp','.heic','.heif','.raw','.cr2','.nef','.arw', + '.dng','.orf','.rw2','.pef','.srw','.x3f' + ) AND phash IS NULL AND status != 'error' + """) + photo_rows = cur.fetchall() + scan_state["total"] = len(photo_rows) + + for i, row in enumerate(photo_rows): + if scan_state["cancel_requested"]: + _mark_scan(cur, scan_id, "cancelled") + con.commit() + scan_state["status"] = "cancelled" + return + + scan_state["progress"] = i + 1 + scan_state["message"] = f"Phash: {Path(row['path']).name}" + ph = _phash(row["path"]) + if ph: + cur.execute("UPDATE files SET phash=? WHERE id=?", (ph, row["id"])) + if (i + 1) % 200 == 0: + con.commit() + + con.commit() + + # ── Phase: grouping ─────────────────────────────────────────────── + scan_state.update(phase="grouping", progress=0, total=4, + message="Running duplicate detection...") + + if mode in ("incremental", "full_reset", "regroup"): + cur.execute("DELETE FROM duplicate_members") + cur.execute("DELETE FROM duplicate_groups") + con.commit() + elif mode == "new_files": + # Only clear groups containing new files + cur.execute(""" + DELETE FROM duplicate_groups WHERE id IN ( + SELECT DISTINCT dm.group_id FROM duplicate_members dm + JOIN files f ON f.id = dm.file_id + WHERE f.scan_id = ? + ) + """, (scan_id,)) + con.commit() + + scan_state["message"] = "Pass 1/4: SHA-256 exact duplicates..." + _run_sha256_pass(con, scan_id) + scan_state["progress"] = 1 + con.commit() + + scan_state["message"] = "Pass 2/4: Perceptual hash similarity..." + _run_phash_pass(con, scan_id) + scan_state["progress"] = 2 + con.commit() + + scan_state["message"] = "Pass 3/4: EXIF timestamp + device..." + _run_exif_pass(con, scan_id) + scan_state["progress"] = 3 + con.commit() + + scan_state["message"] = "Pass 4/4: File size + dimensions..." + _run_filesize_pass(con, scan_id) + scan_state["progress"] = 4 + con.commit() + + # ── Restore keeper statuses for mode=incremental ────────────────── + if mode == "incremental": + # If a previously marked keeper no longer appears in any group, reset to pending + cur.execute(""" + UPDATE files SET status='pending' + WHERE status='keeper' + AND id NOT IN ( + SELECT file_id FROM duplicate_members WHERE is_keeper=1 + ) + """) + con.commit() + + # Update scan record + cur.execute( + "UPDATE scans SET completed_at=CURRENT_TIMESTAMP, total_files=?, status='complete' " + "WHERE id=?", + (len(all_files), scan_id), + ) + con.commit() + + scan_state.update(status="complete", phase="done", + message="Scan complete.", progress=scan_state["total"]) + _update_stats() + + except Exception as e: + scan_state.update(status="error", message=str(e)) + try: + _mark_scan(cur, scan_id, "error") + con.commit() + except Exception: + pass + finally: + con.close() + + +def _mark_scan(cur, scan_id: int, status: str): + cur.execute( + "UPDATE scans SET completed_at=CURRENT_TIMESTAMP, status=? WHERE id=?", + (status, scan_id), + ) + + +def _update_stats(): + """Refresh stats in scan_state.""" + try: + con = get_db() + cur = con.cursor() + cur.execute("SELECT COUNT(*) FROM files WHERE status != 'error'") + total_files = cur.fetchone()[0] + + cur.execute("SELECT COUNT(*), SUM(file_size) FROM files WHERE status='redundant'") + r = cur.fetchone() + dup_count = r[0] or 0 + dup_size = r[1] or 0 + + for method in ("sha256", "phash", "exif", "filesize"): + cur.execute( + "SELECT COUNT(*) FROM duplicate_groups WHERE method=?", (method,) + ) + cur.execute(""" + SELECT method, + COUNT(*) as groups, + (SELECT COUNT(*) FROM duplicate_members dm2 + JOIN duplicate_groups dg2 ON dg2.id=dm2.group_id + WHERE dg2.method=dg.method) as files + FROM duplicate_groups dg + GROUP BY method + """) + by_method = {r["method"]: {"groups": r["groups"], "files": r["files"]} + for r in cur.fetchall()} + + cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1") + reviewed = cur.fetchone()[0] + cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0") + pending = cur.fetchone()[0] + + scan_state["stats"] = { + "total_files": total_files, + "duplicate_files": dup_count, + "duplicate_size_bytes": dup_size, + "groups_by_method": by_method, + "reviewed": reviewed, + "pending": pending, + } + con.close() + except Exception: + pass diff --git a/app/takeout.py b/app/takeout.py new file mode 100644 index 0000000..34c0c42 --- /dev/null +++ b/app/takeout.py @@ -0,0 +1,149 @@ +""" +Google Takeout pre-processor. +Detects Takeout folder structures, reads JSON sidecars, and enriches +the files table with corrected timestamps, normalized filenames, and +edit-version flags. +""" + +import json +import os +import re +import sqlite3 +from datetime import datetime, timezone +from pathlib import Path + + +# Google edit suffixes appended to filenames +EDIT_SUFFIXES = ("-edited", "-effects", "-smile", "-mix") + + +def _find_sidecar(media_path: str) -> str | None: + """Return path to the JSON sidecar for a media file, or None.""" + p = Path(media_path) + # Try filename.ext.json first, then filename.json + candidates = [ + str(p) + ".json", + str(p.with_suffix(".json")), + ] + for c in candidates: + if os.path.isfile(c): + return c + return None + + +def _strip_collision_suffix(filename: str) -> str: + """Strip Google's (1), (2) collision suffixes from a filename.""" + stem = Path(filename).stem + ext = Path(filename).suffix + cleaned = re.sub(r"\(\d+\)$", "", stem).rstrip() + return cleaned + ext + + +def _is_edited(filename: str) -> bool: + stem = Path(filename).stem.lower() + return any(stem.endswith(s) for s in EDIT_SUFFIXES) + + +def is_takeout_folder(folder_path: str) -> bool: + """ + Heuristic: walk folder looking for .json files whose names match + adjacent media files. If we find at least 5 such pairs, call it Takeout. + """ + count = 0 + for root, dirs, files in os.walk(folder_path): + # Skip hidden dirs + dirs[:] = [d for d in dirs if not d.startswith(".")] + file_set = set(files) + for f in files: + if not f.endswith(".json"): + continue + # Check if a media file exists that this could be a sidecar for + base = f[:-5] # strip .json + if base in file_set: + count += 1 + if count >= 5: + return True + return False + + +def process_takeout(folder_path: str, db_path: str) -> int: + """ + Walk folder_path, find all media files with JSON sidecars, + and enrich their DB records. Returns count of files enriched. + """ + con = sqlite3.connect(db_path) + con.row_factory = sqlite3.Row + cur = con.cursor() + + enriched = 0 + + for root, dirs, files in os.walk(folder_path): + dirs[:] = [d for d in dirs if not d.startswith(".")] + for fname in files: + if fname.endswith(".json"): + continue + media_path = os.path.join(root, fname) + sidecar = _find_sidecar(media_path) + if not sidecar: + continue + + try: + with open(sidecar, "r", encoding="utf-8") as f: + data = json.load(f) + except (json.JSONDecodeError, OSError): + continue + + # Extract fields from sidecar + photo_taken_ts = None + try: + ts = int(data["photoTakenTime"]["timestamp"]) + dt = datetime.fromtimestamp(ts, tz=timezone.utc) + photo_taken_ts = dt.strftime("%Y-%m-%dT%H:%M:%S") + except (KeyError, ValueError, TypeError): + pass + + title = data.get("title", "") + takeout_json_str = json.dumps(data) + + # Normalized filename: use title if present, else strip suffix from fname + if title: + normalized = _strip_collision_suffix(title) + else: + normalized = _strip_collision_suffix(fname) + + edited = _is_edited(fname) + + # Update the DB record for this file + updates = { + "is_takeout": 1, + "filename": normalized, + "takeout_json": takeout_json_str, + } + if photo_taken_ts: + updates["exif_datetime"] = photo_taken_ts + + set_clause = ", ".join(f"{k} = ?" for k in updates) + values = list(updates.values()) + [media_path] + + cur.execute( + f"UPDATE files SET {set_clause}, updated_at = CURRENT_TIMESTAMP " + f"WHERE path = ?", + values, + ) + + # Handle edited flag — add is_edited column if needed (migration-safe) + if edited: + try: + cur.execute( + "UPDATE files SET is_edited = 1 WHERE path = ?", + (media_path,), + ) + except sqlite3.OperationalError: + pass # column doesn't exist yet, skip + + if cur.rowcount > 0: + enriched += 1 + + con.commit() + con.close() + return enriched diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..3e59faa --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,17 @@ +services: + dup-finder: + build: . + container_name: dup-finder + restart: unless-stopped + ports: + - "8765:8000" + volumes: + # Mount the photo library — READ ONLY, never modified + - /volume1/photos:/photos:ro + # Database persistence + - /volume1/docker/dup-finder/data:/data + deploy: + resources: + limits: + cpus: "2.0" + memory: 2G diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c15bd5c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +fastapi==0.115.6 +uvicorn==0.32.1 +Pillow==11.0.0 +imagehash==4.3.1 +pillow-heif==0.21.0 +jinja2==3.1.4 +aiofiles==24.1.0 diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 0000000..a085d6e --- /dev/null +++ b/templates/index.html @@ -0,0 +1,1392 @@ + + + + + +Duplicate Finder + + + + + +
+

DupFinder

+
idle
+
+ files + groups + pending + recoverable +
+
+ + +
+ + + + + +
+ + +
+
+
+
Total files
+
+
+
+
+
Duplicate groups
+
+
+
+
+
Space recoverable
+
+
if redundant removed
+
+
+
Reviewed
+
+
+
+
+ +
+
Scan library
+
+
+ + +
+
+ +
+
+ Scanning... + +
+
+
+
+
+ Discovery + Takeout + Indexing + Phash + Grouping +
+
+ +
+
+ +
+
+
+ +
+
+
+ + Only processes files added or modified since last scan. Prior decisions preserved. +
+
+ + Indexes newly added files only. Existing decisions untouched. +
+
+ + Re-runs duplicate detection on existing index. No re-hashing. +
+
+ + Wipes all data and re-scans everything. +
+
+
+
+ + +
Breakdown by method
+
+
+
⬤ Exact copies (SHA-256)
+
+
— files
+
+
+
⬤ Visual match (phash)
+
+
— files
+
+
+
⬤ Same moment (EXIF)
+
+
— files
+
+
+
⬤ Possible (size+dims)
+
+
— files
+
+
+
+ + + + + +
+
+
+ + +
+
+ + No files are moved or deleted. This tool only records decisions. +
+
+
+ +
+
+ + +
+
+

Confirm action

+

Are you sure?

+ +
+ + +
+
+
+ + +
+ + + +