Initial implementation of duplicate finder

Full project per spec: FastAPI backend, 4-method duplicate detection
(SHA-256, phash, EXIF, filesize), Google Takeout pre-processor,
4 scan modes, and dark-theme vanilla JS gallery frontend.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
tocmo
2026-04-04 23:42:58 -04:00
commit 868da9016d
8 changed files with 2993 additions and 0 deletions

599
app/main.py Normal file
View File

@@ -0,0 +1,599 @@
"""
FastAPI application — all API routes for the duplicate finder.
"""
import csv
import io
import os
import sqlite3
import subprocess
import threading
from pathlib import Path
from typing import Optional
from fastapi import FastAPI, HTTPException, Query, Request
from fastapi.responses import (
FileResponse, JSONResponse, Response, StreamingResponse
)
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel
import scanner as sc
app = FastAPI(title="Duplicate Finder")
templates = Jinja2Templates(directory="/app/templates")
app.mount("/static", StaticFiles(directory="/app/static"), name="static")
# Per-method UI metadata: badge color + human-readable label for each of the
# four detection passes.  Keys must match duplicate_groups.method values.
METHOD_META = {
    "sha256": {"color": "#378ADD", "label": "Exact copy"},
    "phash": {"color": "#9b7de8", "label": "Visual match"},
    "exif": {"color": "#e2a43a", "label": "Same moment"},
    "filesize": {"color": "#888780", "label": "Possible match"},
}
# ── Startup ───────────────────────────────────────────────────────────────────
@app.on_event("startup")
def startup():
    """Create the SQLite schema (idempotent) when the app starts.

    NOTE(review): @app.on_event is deprecated in newer FastAPI in favor
    of lifespan handlers — consider migrating on the next upgrade.
    """
    sc.init_db()
# ── Frontend ──────────────────────────────────────────────────────────────────
@app.get("/")
def index(request: Request):
    """Serve the single-page gallery frontend."""
    return templates.TemplateResponse("index.html", {"request": request})
# ── DB helper ─────────────────────────────────────────────────────────────────
def get_db() -> sqlite3.Connection:
    """Open a new DB connection (delegates to scanner.get_db; caller closes)."""
    return sc.get_db()
# ── Scan management ───────────────────────────────────────────────────────────
class ScanStartBody(BaseModel):
    """Request body for POST /api/scan/start."""
    # Absolute path of the folder to scan.
    folder_path: str
    # One of: incremental | full_reset | new_files | regroup.
    mode: str = "incremental"
@app.post("/api/scan/start")
def scan_start(body: ScanStartBody):
    """Start a background scan of ``body.folder_path``.

    Rejects the request when a scan is already running or the mode is
    unknown.  For mode="full_reset" all previously indexed data is
    wiped first.  Returns {"scan_id": <id of the new scans row>}.
    """
    # NOTE(review): this check-then-act on shared scan_state is not
    # atomic; two near-simultaneous requests could both pass it.
    if sc.scan_state["status"] == "running":
        raise HTTPException(400, "A scan is already running")
    mode = body.mode
    if mode not in ("incremental", "full_reset", "new_files", "regroup"):
        raise HTTPException(400, f"Unknown scan mode: {mode}")
    # Single connection for both the optional wipe and the scans insert
    # (the original opened and closed two connections back to back).
    con = get_db()
    try:
        cur = con.cursor()
        if mode == "full_reset":
            # Delete in FK-dependency order.
            cur.execute("DELETE FROM duplicate_members")
            cur.execute("DELETE FROM duplicate_groups")
            cur.execute("DELETE FROM files")
            cur.execute("DELETE FROM scans")
        cur.execute(
            "INSERT INTO scans (folder_path, status) VALUES (?, 'running')",
            (body.folder_path,),
        )
        scan_id = cur.lastrowid
        con.commit()
    finally:
        con.close()
    sc.scan_state.update(
        scan_id=scan_id,
        status="running",
        phase="discovery",
        progress=0,
        total=0,
        message="Starting...",
        cancel_requested=False,
        stats={},
    )
    # Daemon thread: a dying server should not be held open by a scan.
    thread = threading.Thread(
        target=sc.run_scan,
        args=(body.folder_path, scan_id, mode),
        daemon=True,
    )
    thread.start()
    return {"scan_id": scan_id}
@app.get("/api/scan/status")
def scan_status():
    """Return the live scan state plus current group/file counts."""
    state = sc.scan_state
    con = get_db()
    cur = con.cursor()

    def scalar(sql, *args):
        # Tiny helper: run a COUNT-style query and return its single value.
        cur.execute(sql, args)
        return cur.fetchone()[0]

    stats = {
        f"{m}_groups": scalar(
            "SELECT COUNT(*) FROM duplicate_groups WHERE method=?", m
        )
        for m in ("sha256", "phash", "exif", "filesize")
    }
    stats["total_groups"] = scalar("SELECT COUNT(*) FROM duplicate_groups")
    stats["reviewed"] = scalar("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
    stats["pending"] = scalar("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
    stats["total_files"] = scalar("SELECT COUNT(*) FROM files WHERE status != 'error'")
    con.close()
    return {
        "scan_id": state["scan_id"],
        "status": state["status"],
        "phase": state["phase"],
        "progress": state["progress"],
        "total": state["total"],
        "message": state["message"],
        "stats": stats,
    }
@app.post("/api/scan/cancel")
def scan_cancel():
    """Flag the running scan for cancellation (the worker polls the flag)."""
    running = sc.scan_state["status"] == "running"
    if not running:
        raise HTTPException(400, "No scan is currently running")
    sc.scan_state["cancel_requested"] = True
    return {"success": True}
@app.delete("/api/scan/reset")
def scan_reset(confirm: str = Query("")):
    """Wipe all indexed files, groups, and scan history.

    Destructive — requires the literal query string ?confirm=RESET.
    """
    if confirm != "RESET":
        raise HTTPException(400, "Pass ?confirm=RESET to confirm")
    con = get_db()
    cur = con.cursor()
    # Fixed table names only — safe to interpolate.
    for table in ("duplicate_members", "duplicate_groups", "files", "scans"):
        cur.execute(f"DELETE FROM {table}")
    con.commit()
    con.close()
    sc.scan_state.update(
        scan_id=None, status="idle", phase="idle",
        progress=0, total=0, message="", stats={},
    )
    return {"success": True}
# ── Duplicate groups ──────────────────────────────────────────────────────────
@app.get("/api/groups")
def list_groups(
    method: str = "all",
    reviewed: str = "false",
    sort: str = "count",
    offset: int = 0,
    limit: int = 50,
):
    """Paginated listing of duplicate groups.

    Filters by detection method ("all" disables the filter) and review
    state ("false"/"true"; any other value returns both).  Returns the
    total matching count plus one summary per group, including the
    suggested keeper's thumbnail info when one is flagged.
    """
    con = get_db()
    cur = con.cursor()
    where = []
    params: list = []
    if method != "all":
        where.append("dg.method = ?")
        params.append(method)
    if reviewed == "false":
        where.append("dg.reviewed = 0")
    elif reviewed == "true":
        where.append("dg.reviewed = 1")
    where_clause = ("WHERE " + " AND ".join(where)) if where else ""
    # ORDER BY is interpolated into the SQL below, so it is restricted
    # to this fixed whitelist — never raw user input.
    order = {
        "count": "member_count DESC",
        "method": "dg.method, member_count DESC",
        "date": "dg.created_at DESC",
    }.get(sort, "member_count DESC")
    cur.execute(f"""
        SELECT COUNT(*) FROM duplicate_groups dg {where_clause}
    """, params)
    total = cur.fetchone()[0]
    cur.execute(f"""
        SELECT dg.id, dg.method, dg.reviewed,
               COUNT(dm.id) as member_count,
               (SELECT dm2.file_id FROM duplicate_members dm2
                WHERE dm2.group_id = dg.id AND dm2.suggested = 1
                LIMIT 1) as suggested_file_id
        FROM duplicate_groups dg
        LEFT JOIN duplicate_members dm ON dm.group_id = dg.id
        {where_clause}
        GROUP BY dg.id
        ORDER BY {order}
        LIMIT ? OFFSET ?
    """, params + [limit, offset])
    groups = []
    for row in cur.fetchall():
        meta = METHOD_META.get(row["method"], {"color": "#888", "label": row["method"]})
        suggested = None
        if row["suggested_file_id"]:
            # NOTE(review): one extra SELECT per listed group (N+1 pattern);
            # tolerable at limit<=50 but could be folded into the main JOIN.
            cur.execute(
                "SELECT id, filename, width, height FROM files WHERE id=?",
                (row["suggested_file_id"],),
            )
            f = cur.fetchone()
            if f:
                suggested = {
                    "file_id": f["id"],
                    "filename": f["filename"],
                    "width": f["width"],
                    "height": f["height"],
                    "thumb_url": f"/api/thumb/{f['id']}",
                }
        groups.append({
            "id": row["id"],
            "method": row["method"],
            "method_color": meta["color"],
            "method_label": meta["label"],
            "member_count": row["member_count"],
            "reviewed": bool(row["reviewed"]),
            "suggested_keeper": suggested,
        })
    con.close()
    return {"total": total, "groups": groups}
@app.get("/api/groups/{group_id}")
def get_group(group_id: int):
    """Return one duplicate group with its full member list.

    Members are ordered suggested-first, then by pixel area descending.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT * FROM duplicate_groups WHERE id=?", (group_id,))
    grp = cur.fetchone()
    if not grp:
        raise HTTPException(404, "Group not found")
    meta = METHOD_META.get(grp["method"], {"color": "#888", "label": grp["method"]})
    cur.execute("""
        SELECT f.id, f.filename, f.path, f.file_size, f.width, f.height,
               f.mime_type, f.exif_datetime, f.exif_device,
               f.is_takeout, f.is_edited,
               dm.is_keeper, dm.suggested
        FROM duplicate_members dm
        JOIN files f ON f.id = dm.file_id
        WHERE dm.group_id = ?
        ORDER BY dm.suggested DESC, f.width * f.height DESC
    """, (group_id,))

    def as_member(r) -> dict:
        # Shape one joined row into the API member payload.
        return {
            "file_id": r["id"],
            "filename": r["filename"],
            "path": r["path"],
            "file_size": r["file_size"],
            "width": r["width"],
            "height": r["height"],
            "mime_type": r["mime_type"],
            "exif_datetime": r["exif_datetime"],
            "exif_device": r["exif_device"],
            "is_takeout": bool(r["is_takeout"]),
            "is_edited": False if r["is_edited"] is None else bool(r["is_edited"]),
            "is_suggested": bool(r["suggested"]),
            "is_keeper": bool(r["is_keeper"]),
            "thumb_url": f"/api/thumb/{r['id']}",
        }

    members = [as_member(r) for r in cur.fetchall()]
    con.close()
    return {
        "id": grp["id"],
        "method": grp["method"],
        "method_color": meta["color"],
        "method_label": meta["label"],
        "method_value": grp["method_value"],
        "reviewed": bool(grp["reviewed"]),
        "members": members,
    }
# ── Decisions ─────────────────────────────────────────────────────────────────
class DecideBody(BaseModel):
    """Request body for POST /api/groups/{group_id}/decide."""
    # File that survives; every other member becomes 'redundant'.
    keeper_file_id: int
@app.post("/api/groups/{group_id}/decide")
def decide(group_id: int, body: DecideBody):
    """Choose the keeper for a group; all other members become 'redundant'."""
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    if not cur.fetchone():
        raise HTTPException(404, "Group not found")
    cur.execute("SELECT file_id FROM duplicate_members WHERE group_id=?", (group_id,))
    member_ids = [row["file_id"] for row in cur.fetchall()]
    if body.keeper_file_id not in member_ids:
        raise HTTPException(400, "keeper_file_id is not a member of this group")
    for member_id in member_ids:
        keep = int(member_id == body.keeper_file_id)
        cur.execute(
            "UPDATE duplicate_members SET is_keeper=? WHERE group_id=? AND file_id=?",
            (keep, group_id, member_id),
        )
        cur.execute(
            "UPDATE files SET status=? WHERE id=?",
            ("keeper" if keep else "redundant", member_id),
        )
    cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}
@app.post("/api/groups/{group_id}/skip")
def skip_group(group_id: int):
    """Mark a group reviewed without changing any file statuses."""
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    exists = cur.fetchone() is not None
    if not exists:
        raise HTTPException(404, "Group not found")
    cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}
@app.post("/api/groups/{group_id}/keep-all")
def keep_all(group_id: int):
    """Keep every member of a group and mark the group reviewed.

    Improvement over the original: the per-member Python loop (two
    statements per file) is replaced with two set-based UPDATEs —
    identical end state, constant number of statements.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    if not cur.fetchone():
        raise HTTPException(404, "Group not found")
    cur.execute(
        "UPDATE duplicate_members SET is_keeper=1 WHERE group_id=?",
        (group_id,),
    )
    cur.execute(
        "UPDATE files SET status='keeper' WHERE id IN "
        "(SELECT file_id FROM duplicate_members WHERE group_id=?)",
        (group_id,),
    )
    cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}
@app.post("/api/groups/{group_id}/unreview")
def unreview_group(group_id: int):
    """Undo a review: clear keeper flags and reset members to 'pending'.

    Improvement over the original: the per-member Python loop is
    replaced with two set-based UPDATEs — identical end state.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    if not cur.fetchone():
        raise HTTPException(404, "Group not found")
    cur.execute(
        "UPDATE duplicate_members SET is_keeper=0 WHERE group_id=?",
        (group_id,),
    )
    cur.execute(
        "UPDATE files SET status='pending' WHERE id IN "
        "(SELECT file_id FROM duplicate_members WHERE group_id=?)",
        (group_id,),
    )
    cur.execute("UPDATE duplicate_groups SET reviewed=0 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}
@app.post("/api/groups/auto-resolve-exact")
def auto_resolve_exact():
    """Bulk-resolve every unreviewed SHA-256 (byte-identical) group.

    For each group the member chosen by
    sc._suggested_keeper_by_resolution is kept and the rest are marked
    redundant; the group is flagged reviewed.  Returns
    {"resolved": <number of groups processed>}.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("""
        SELECT id FROM duplicate_groups
        WHERE method='sha256' AND reviewed=0
    """)
    groups = [r["id"] for r in cur.fetchall()]
    resolved = 0
    for gid in groups:
        cur.execute("""
            SELECT f.id, f.width, f.height, f.file_size, f.exif_datetime
            FROM duplicate_members dm
            JOIN files f ON f.id = dm.file_id
            WHERE dm.group_id = ?
        """, (gid,))
        members = [dict(r) for r in cur.fetchall()]
        if not members:
            # Empty group (members pruned elsewhere) — leave unreviewed.
            continue
        keeper_id = sc._suggested_keeper_by_resolution(members)
        for m in members:
            is_k = 1 if m["id"] == keeper_id else 0
            cur.execute(
                "UPDATE duplicate_members SET is_keeper=? WHERE group_id=? AND file_id=?",
                (is_k, gid, m["id"]),
            )
            cur.execute(
                "UPDATE files SET status=? WHERE id=?",
                ("keeper" if is_k else "redundant", m["id"]),
            )
        cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (gid,))
        resolved += 1
    con.commit()
    con.close()
    return {"resolved": resolved}
# ── Files + thumbnails ────────────────────────────────────────────────────────
# Inline SVG returned when a video frame cannot be extracted:
# dark tile with a purple play triangle.
VIDEO_PLACEHOLDER_SVG = """<svg xmlns="http://www.w3.org/2000/svg" width="200" height="200"
viewBox="0 0 200 200">
<rect width="200" height="200" fill="#1e1e2e"/>
<polygon points="75,55 75,145 145,100" fill="#9b7de8"/>
</svg>"""
# Extensions routed through the ffmpeg first-frame path in get_thumb.
VIDEO_EXT = {".mp4", ".mov", ".avi", ".mkv", ".m4v", ".3gp", ".wmv", ".mts", ".m2ts"}
@app.get("/api/thumb/{file_id}")
def get_thumb(file_id: int):
    """Serve a preview image for a file.

    Videos: first frame extracted with ffmpeg (JPEG), falling back to a
    static SVG play-button placeholder on any failure.  Photos: the
    original file is served as-is.

    NOTE(review): no downscaling or caching happens here — "thumb" is a
    misnomer for photos (full-size bytes are sent), and ffmpeg runs on
    every video request.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT path, mime_type, extension FROM files WHERE id=?", (file_id,))
    row = cur.fetchone()
    con.close()
    if not row:
        raise HTTPException(404, "File not found")
    path = row["path"]
    ext = (row["extension"] or "").lower()
    if not os.path.isfile(path):
        raise HTTPException(404, "File not on disk")
    if ext in VIDEO_EXT:
        # Try ffmpeg for first frame
        try:
            result = subprocess.run(
                [
                    "ffmpeg", "-i", path,
                    "-vframes", "1", "-f", "image2", "-vcodec", "mjpeg",
                    "pipe:1",
                ],
                capture_output=True, timeout=10,
            )
            if result.returncode == 0 and result.stdout:
                return Response(content=result.stdout, media_type="image/jpeg")
        except Exception:
            # ffmpeg missing/timeout — fall through to the placeholder.
            pass
        return Response(content=VIDEO_PLACEHOLDER_SVG, media_type="image/svg+xml")
    # Serve photo directly
    mime = row["mime_type"] or "application/octet-stream"
    return FileResponse(path, media_type=mime)
@app.get("/api/files/{file_id}")
def get_file_meta(file_id: int):
    """Return the raw files-table row for one file as JSON."""
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT * FROM files WHERE id=?", (file_id,))
    record = cur.fetchone()
    con.close()
    if record is None:
        raise HTTPException(404, "File not found")
    return dict(record)
# ── Stats ─────────────────────────────────────────────────────────────────────
@app.get("/api/stats")
def get_stats():
    """Aggregate dashboard numbers: totals, reclaimable bytes, per-method counts.

    NOTE(review): the "duplicate" query joins files to ALL of their
    group memberships with is_keeper=0 — a file that appears in several
    groups (different methods) is counted/summed once per membership,
    and unreviewed groups are included.  Confirm that inflation is
    acceptable for the dashboard.
    """
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT COUNT(*), SUM(file_size) FROM files WHERE status != 'error'")
    r = cur.fetchone()
    total_files = r[0] or 0
    total_size = r[1] or 0
    cur.execute("""
        SELECT COUNT(*), SUM(f.file_size)
        FROM files f
        JOIN duplicate_members dm ON dm.file_id = f.id
        WHERE dm.is_keeper = 0
    """)
    r = cur.fetchone()
    dup_files = r[0] or 0
    dup_size = r[1] or 0
    by_method = {}
    for method in ("sha256", "phash", "exif", "filesize"):
        cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE method=?", (method,))
        groups = cur.fetchone()[0]
        cur.execute("""
            SELECT COUNT(*) FROM duplicate_members dm
            JOIN duplicate_groups dg ON dg.id = dm.group_id
            WHERE dg.method = ?
        """, (method,))
        files = cur.fetchone()[0]
        by_method[method] = {"groups": groups, "files": files}
    cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
    reviewed = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
    pending = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM files WHERE is_takeout=1")
    takeout_files = cur.fetchone()[0]
    con.close()
    return {
        "total_files": total_files,
        "total_size_bytes": total_size,
        "duplicate_files": dup_files,
        "duplicate_size_bytes": dup_size,
        "groups_by_method": by_method,
        "reviewed": reviewed,
        "pending": pending,
        "takeout_files": takeout_files,
    }
# ── Export ────────────────────────────────────────────────────────────────────
@app.get("/api/export/csv")
def export_csv():
    """Download every duplicate group and its members as a CSV attachment."""
    con = get_db()
    cur = con.cursor()
    cur.execute("""
        SELECT dg.id as group_id, dg.method, f.id as file_id,
               f.path, f.filename, f.file_size,
               f.width, f.height, f.exif_datetime, f.exif_device,
               dm.is_keeper,
               CASE WHEN dm.is_keeper=0 AND dg.reviewed=1 THEN 1 ELSE 0 END as is_redundant,
               dg.reviewed
        FROM duplicate_groups dg
        JOIN duplicate_members dm ON dm.group_id = dg.id
        JOIN files f ON f.id = dm.file_id
        ORDER BY dg.id, dm.is_keeper DESC
    """)
    rows = cur.fetchall()
    con.close()
    # Output header names differ slightly from the column aliases.
    header = [
        "group_id", "method", "file_id", "path", "filename",
        "size", "width", "height", "exif_date", "device",
        "is_keeper", "is_redundant", "reviewed",
    ]
    columns = [
        "group_id", "method", "file_id", "path", "filename", "file_size",
        "width", "height", "exif_datetime", "exif_device",
        "is_keeper", "is_redundant", "reviewed",
    ]
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(header)
    writer.writerows([row[col] for col in columns] for row in rows)
    buf.seek(0)
    return StreamingResponse(
        iter([buf.getvalue()]),
        media_type="text/csv",
        headers={"Content-Disposition": "attachment; filename=dup-finder-export.csv"},
    )

758
app/scanner.py Normal file
View File

@@ -0,0 +1,758 @@
"""
File scanner: discovery, per-file extraction, and all 4 duplicate detection passes.
"""
import hashlib
import mimetypes
import os
import sqlite3
import subprocess
from datetime import datetime
from pathlib import Path
import imagehash
from PIL import Image, ExifTags, UnidentifiedImageError
try:
from pillow_heif import register_heif_opener
register_heif_opener()
except ImportError:
pass
from takeout import is_takeout_folder, process_takeout
# Media extensions the scanner indexes: photos get EXIF + perceptual
# hashing; videos only dimensions (via ffprobe) and mtime.
PHOTO_EXT = {
    ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif",
    ".webp", ".heic", ".heif", ".raw", ".cr2", ".nef", ".arw",
    ".dng", ".orf", ".rw2", ".pef", ".srw", ".x3f",
}
VIDEO_EXT = {
    ".mp4", ".mov", ".avi", ".mkv", ".m4v", ".3gp",
    ".wmv", ".mts", ".m2ts",
}
SUPPORTED_EXT = PHOTO_EXT | VIDEO_EXT
# SQLite database location (container-internal path).
DB_PATH = "/data/dupfinder.db"
# Shared scan state (updated by background thread, read by status endpoint)
scan_state = {
    "scan_id": None,
    "status": "idle",  # idle | running | complete | error | cancelled
    "phase": "idle",  # discovery | takeout | indexing | phash | grouping | done
    "progress": 0,
    "total": 0,
    "message": "",
    "cancel_requested": False,
    "stats": {},
}
# ── DB helpers ────────────────────────────────────────────────────────────────
def get_db() -> sqlite3.Connection:
    """Open a new connection to the app database (caller closes it).

    WAL journaling lets the background scan thread and request handlers
    read/write concurrently; foreign-key enforcement makes the schema's
    ON DELETE CASCADE clauses take effect.
    """
    con = sqlite3.connect(DB_PATH, timeout=30)
    con.row_factory = sqlite3.Row
    con.execute("PRAGMA journal_mode=WAL")
    con.execute("PRAGMA foreign_keys=ON")
    return con
def init_db():
    """Create tables and indexes if missing (idempotent; safe at every startup).

    Schema overview:
      files             — one row per media file with extracted metadata
      scans             — scan history/audit
      duplicate_groups  — one row per detected duplicate cluster
      duplicate_members — group membership; cascades with its group/file
    """
    con = get_db()
    cur = con.cursor()
    cur.executescript("""
        CREATE TABLE IF NOT EXISTS files (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            path TEXT UNIQUE NOT NULL,
            filename TEXT NOT NULL,
            extension TEXT,
            file_size INTEGER,
            mime_type TEXT,
            sha256 TEXT,
            phash TEXT,
            exif_datetime TEXT,
            exif_device TEXT,
            width INTEGER,
            height INTEGER,
            is_takeout INTEGER DEFAULT 0,
            is_edited INTEGER DEFAULT 0,
            takeout_json TEXT,
            scan_id INTEGER,
            status TEXT DEFAULT 'pending',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE TABLE IF NOT EXISTS scans (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            folder_path TEXT NOT NULL,
            started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            completed_at TIMESTAMP,
            total_files INTEGER DEFAULT 0,
            status TEXT DEFAULT 'running'
        );
        CREATE TABLE IF NOT EXISTS duplicate_groups (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            method TEXT NOT NULL,
            method_value TEXT,
            reviewed INTEGER DEFAULT 0,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE TABLE IF NOT EXISTS duplicate_members (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            group_id INTEGER REFERENCES duplicate_groups(id) ON DELETE CASCADE,
            file_id INTEGER REFERENCES files(id) ON DELETE CASCADE,
            is_keeper INTEGER DEFAULT 0,
            suggested INTEGER DEFAULT 0
        );
        CREATE INDEX IF NOT EXISTS idx_sha256 ON files(sha256);
        CREATE INDEX IF NOT EXISTS idx_phash ON files(phash);
        CREATE INDEX IF NOT EXISTS idx_exif_dt ON files(exif_datetime, exif_device);
        CREATE INDEX IF NOT EXISTS idx_size_dim ON files(file_size, width, height);
        CREATE INDEX IF NOT EXISTS idx_status ON files(status);
    """)
    con.commit()
    con.close()
# ── Per-file extraction ───────────────────────────────────────────────────────
def _sha256(path: str) -> str:
h = hashlib.sha256()
with open(path, "rb") as f:
while chunk := f.read(65536):
h.update(chunk)
return h.hexdigest()
def _exif_data(path: str) -> tuple[str | None, str | None]:
    """Returns (exif_datetime, exif_device) or (None, None).

    exif_datetime is normalized to ISO-8601 (YYYY-MM-DDTHH:MM:SS);
    exif_device is "Make Model" when either tag is present.  Any
    failure (unreadable file, no EXIF, bad date) degrades to None.
    """
    try:
        # Fix: the original never closed the Image, leaking a file handle
        # per call; `with` closes it as soon as EXIF is read.
        with Image.open(path) as img:
            exif_raw = img._getexif()  # legacy merged-IFD dict API
        if not exif_raw:
            return None, None
        exif = {ExifTags.TAGS.get(k, k): v for k, v in exif_raw.items()}
        dt = exif.get("DateTimeOriginal") or exif.get("DateTime")
        if dt:
            try:
                dt = datetime.strptime(dt, "%Y:%m:%d %H:%M:%S").strftime("%Y-%m-%dT%H:%M:%S")
            except ValueError:
                dt = None
        make = str(exif.get("Make", "")).strip()
        model = str(exif.get("Model", "")).strip()
        device = (make + " " + model).strip() if (make or model) else None
        return dt, device
    except Exception:
        return None, None
def _image_dims(path: str) -> tuple[int | None, int | None]:
    """Return (width, height) of an image, or (None, None) if unreadable."""
    try:
        with Image.open(path) as img:
            return img.size  # (width, height)
    except Exception:
        return None, None
def _phash(path: str) -> str | None:
    """Return the image's perceptual hash as a hex string, or None on failure."""
    try:
        with Image.open(path) as img:
            return str(imagehash.phash(img))
    except Exception:
        return None
def _video_dims(path: str) -> tuple[int | None, int | None]:
    """Return (width, height) of a video's first stream via ffprobe.

    Falls back to (None, None) on any failure (ffprobe missing, timeout,
    unparsable output).  NOTE(review): the return code is not checked —
    an ffprobe error simply yields empty stdout, which fails the parse.
    """
    try:
        result = subprocess.run(
            [
                "ffprobe", "-v", "error",
                "-select_streams", "v:0",
                "-show_entries", "stream=width,height",
                "-of", "csv=p=0",
                path,
            ],
            capture_output=True, text=True, timeout=10,
        )
        parts = result.stdout.strip().split(",")
        if len(parts) == 2:
            return int(parts[0]), int(parts[1])
    except Exception:
        pass
    return None, None
def _mtime_str(path: str) -> str | None:
try:
ts = os.path.getmtime(path)
return datetime.fromtimestamp(ts).strftime("%Y-%m-%dT%H:%M:%S")
except Exception:
return None
def extract_file(path: str) -> dict:
    """Build a files-table record dict for one media file.

    path/filename/extension are always filled; file_size, sha256,
    dimensions and EXIF fields are best-effort and stay None on
    failure.  Photos get dimensions + EXIF (falling back to mtime for
    the timestamp); videos get dimensions from ffprobe and use mtime.
    phash stays None here — it is computed in its own scan phase so the
    UI can report progress separately.
    """
    ext = Path(path).suffix.lower()
    filename = Path(path).name
    is_photo = ext in PHOTO_EXT
    is_video = ext in VIDEO_EXT
    record = {
        "path": path,
        "filename": filename,
        "extension": ext,
        "file_size": None,
        "mime_type": None,
        "sha256": None,
        "phash": None,
        "exif_datetime": None,
        "exif_device": None,
        "width": None,
        "height": None,
    }
    try:
        record["file_size"] = os.path.getsize(path)
    except OSError:
        pass
    record["mime_type"] = mimetypes.guess_type(path)[0]
    try:
        record["sha256"] = _sha256(path)
    except OSError:
        pass
    if is_photo:
        w, h = _image_dims(path)
        record["width"], record["height"] = w, h
        dt, device = _exif_data(path)
        # Fall back to filesystem mtime when EXIF has no timestamp.
        record["exif_datetime"] = dt or _mtime_str(path)
        record["exif_device"] = device
        # phash computed in separate phase for progress reporting
    elif is_video:
        w, h = _video_dims(path)
        record["width"], record["height"] = w, h
        record["exif_datetime"] = _mtime_str(path)
    return record
# ── Union-Find for phash grouping ────────────────────────────────────────────
class UnionFind:
    """Disjoint-set forest over int ids with path compression."""

    def __init__(self):
        # element -> parent; roots map to themselves.
        self.parent: dict[int, int] = {}

    def find(self, x: int) -> int:
        """Return the root of x's set, registering x if unseen.

        Fix: iterative with full path compression — the original
        recursed, which can raise RecursionError on very long parent
        chains.  The resulting parent map is identical.
        """
        parent = self.parent
        if x not in parent:
            parent[x] = x
            return x
        root = x
        while parent[root] != root:
            root = parent[root]
        # Second pass: point every node on the path directly at the root.
        while parent[x] != root:
            parent[x], x = root, parent[x]
        return root

    def union(self, x: int, y: int):
        """Merge the sets containing x and y (y's root absorbs x's)."""
        px, py = self.find(x), self.find(y)
        if px != py:
            self.parent[px] = py

    def groups(self) -> dict[int, list[int]]:
        """Return {root: members} for every set with at least 2 members."""
        from collections import defaultdict
        result: dict[int, list[int]] = defaultdict(list)
        for x in self.parent:
            result[self.find(x)].append(x)
        return {k: v for k, v in result.items() if len(v) >= 2}
# ── Detection passes ──────────────────────────────────────────────────────────
def _suggested_keeper_by_resolution(members: list[dict]) -> int:
"""Return file_id of highest resolution member; tie-break by size then oldest date."""
def score(m):
w = m["width"] or 0
h = m["height"] or 0
size = m["file_size"] or 0
dt = m["exif_datetime"] or "9999"
return (w * h, size, dt)
best = max(members, key=lambda m: (
(m["width"] or 0) * (m["height"] or 0),
m["file_size"] or 0,
# older date = better; invert by negating epoch or use str comparison inverted
))
return best["id"]
def _suggested_keeper_oldest(members: list[dict]) -> int:
def key(m):
return m["exif_datetime"] or "9999"
return min(members, key=key)["id"]
def _run_sha256_pass(con: sqlite3.Connection, scan_id: int):
    """Create one 'sha256' group per hash shared by 2+ files.

    First detection pass — runs over every hashed file with no
    exclusions.  The highest-resolution member of each group is flagged
    suggested=1.  (scan_id is currently unused; kept for signature
    parity with the other passes.)  Caller commits.
    """
    cur = con.cursor()
    cur.execute("""
        SELECT sha256, COUNT(*) as cnt
        FROM files
        WHERE sha256 IS NOT NULL
        GROUP BY sha256
        HAVING cnt > 1
    """)
    rows = cur.fetchall()
    for row in rows:
        sha = row["sha256"]
        cur.execute("""
            SELECT id, width, height, file_size, exif_datetime
            FROM files WHERE sha256 = ?
        """, (sha,))
        members = [dict(r) for r in cur.fetchall()]
        keeper_id = _suggested_keeper_by_resolution(members)
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('sha256', ?)",
            (sha,),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )
def _run_phash_pass(con: sqlite3.Connection, scan_id: int):
    """Group visually similar photos by perceptual-hash Hamming distance.

    Files already claimed by a sha256 group and all video extensions are
    excluded.  Candidates are bucketed by the first two hex chars of
    their phash to bound the pairwise comparison cost.
    NOTE(review): bucketing means near-duplicates whose hashes differ in
    the leading byte are never compared — a recall/speed trade-off.
    """
    cur = con.cursor()
    # Exclude files already in sha256 groups
    cur.execute("""
        SELECT f.id, f.phash, f.width, f.height, f.file_size, f.exif_datetime
        FROM files f
        WHERE f.phash IS NOT NULL
        AND f.extension NOT IN (
            '.mp4','.mov','.avi','.mkv','.m4v','.3gp','.wmv','.mts','.m2ts'
        )
        AND f.id NOT IN (
            SELECT dm.file_id FROM duplicate_members dm
            JOIN duplicate_groups dg ON dg.id = dm.group_id
            WHERE dg.method = 'sha256'
        )
    """)
    rows = [dict(r) for r in cur.fetchall()]
    if len(rows) < 2:
        return
    # Perf fix: parse each hex hash ONCE up front — the original called
    # imagehash.hex_to_hash on both operands inside the O(n^2) inner loop.
    parsed: dict[int, object] = {}
    for r in rows:
        try:
            parsed[r["id"]] = imagehash.hex_to_hash(r["phash"])
        except Exception:
            pass  # unparsable hash: its pairs are skipped, as before
    # Bucket by first 2 hex chars to reduce O(n²) comparisons
    buckets: dict[str, list[dict]] = {}
    for r in rows:
        buckets.setdefault(r["phash"][:2], []).append(r)
    uf = UnionFind()
    # Ensure all IDs are registered (singletons included)
    for r in rows:
        uf.find(r["id"])
    THRESHOLD = 10  # max Hamming distance treated as "same image"
    for bucket in buckets.values():
        for i in range(len(bucket)):
            ha = parsed.get(bucket[i]["id"])
            if ha is None:
                continue
            for j in range(i + 1, len(bucket)):
                hb = parsed.get(bucket[j]["id"])
                if hb is not None and ha - hb <= THRESHOLD:
                    uf.union(bucket[i]["id"], bucket[j]["id"])
    id_map = {r["id"]: r for r in rows}
    for member_ids in uf.groups().values():
        members = [id_map[mid] for mid in member_ids if mid in id_map]
        if len(members) < 2:
            continue
        keeper_id = _suggested_keeper_by_resolution(members)
        keeper = id_map[keeper_id]
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('phash', ?)",
            (keeper["phash"],),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )
def _run_exif_pass(con: sqlite3.Connection, scan_id: int):
    """Group files shot at the same instant on the same device.

    Only files not already claimed by a sha256 or phash group are
    considered; each (exif_datetime, exif_device) pair with 2+ such
    files becomes one 'exif' group.  Caller commits.
    """
    cur = con.cursor()
    # Files already claimed by an earlier, stronger pass.
    already_grouped = """
        SELECT dm.file_id FROM duplicate_members dm
        JOIN duplicate_groups dg ON dg.id = dm.group_id
        WHERE dg.method IN ('sha256', 'phash')
    """
    cur.execute(f"""
        SELECT exif_datetime, exif_device, COUNT(*) as cnt
        FROM files
        WHERE exif_datetime IS NOT NULL
        AND exif_device IS NOT NULL
        AND id NOT IN ({already_grouped})
        GROUP BY exif_datetime, exif_device
        HAVING cnt > 1
    """)
    rows = cur.fetchall()
    for row in rows:
        dt, dev = row["exif_datetime"], row["exif_device"]
        # Fix: the member query must apply the same exclusion as the
        # counting query above — previously files already in sha256 or
        # phash groups were pulled back into the exif group even though
        # they had been filtered out of the count.
        cur.execute(f"""
            SELECT id, width, height, file_size, exif_datetime
            FROM files
            WHERE exif_datetime = ? AND exif_device = ?
            AND id NOT IN ({already_grouped})
        """, (dt, dev))
        members = [dict(r) for r in cur.fetchall()]
        if len(members) < 2:
            continue
        keeper_id = _suggested_keeper_by_resolution(members)
        method_value = f"{dt}::{dev}"
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('exif', ?)",
            (method_value,),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )
def _run_filesize_pass(con: sqlite3.Connection, scan_id: int):
    """Group files with identical (file_size, width, height).

    Weakest signal — only files not already claimed by any earlier pass
    are considered.  The oldest member of each group is suggested.
    Caller commits.
    """
    cur = con.cursor()
    # Files already claimed by an earlier, stronger pass.
    already_grouped = """
        SELECT dm.file_id FROM duplicate_members dm
        JOIN duplicate_groups dg ON dg.id = dm.group_id
        WHERE dg.method IN ('sha256', 'phash', 'exif')
    """
    cur.execute(f"""
        SELECT file_size, width, height, COUNT(*) as cnt
        FROM files
        WHERE file_size IS NOT NULL
        AND width IS NOT NULL
        AND height IS NOT NULL
        AND id NOT IN ({already_grouped})
        GROUP BY file_size, width, height
        HAVING cnt > 1
    """)
    rows = cur.fetchall()
    for row in rows:
        fs, w, h = row["file_size"], row["width"], row["height"]
        # Fix: apply the same exclusion when fetching members — the
        # original re-selected ALL matching files, pulling already-
        # grouped files back into the filesize group.
        cur.execute(f"""
            SELECT id, width, height, file_size, exif_datetime
            FROM files
            WHERE file_size = ? AND width = ? AND height = ?
            AND id NOT IN ({already_grouped})
        """, (fs, w, h))
        members = [dict(r) for r in cur.fetchall()]
        if len(members) < 2:
            continue
        keeper_id = _suggested_keeper_oldest(members)
        method_value = f"{fs}::{w}x{h}"
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('filesize', ?)",
            (method_value,),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )
# ── Main scan entry point ─────────────────────────────────────────────────────
def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
    """Main scan function — runs in background thread.

    Phases: discovery -> takeout pre-processing -> indexing (sha256 +
    EXIF + dimensions) -> perceptual hashing -> four grouping passes.
    Progress and cancellation flow through the module-level scan_state
    dict, which the status endpoint polls; cancellation is checked at
    phase boundaries and per file.

    mode:
      incremental — re-hash only new/changed files, rebuild all groups
      full_reset  — wipe files/groups first, index everything
      new_files   — index only unseen paths, rebuild affected groups
      regroup     — skip indexing work, rebuild all groups
    """
    global scan_state  # NOTE(review): redundant — dict is mutated, never rebound
    con = get_db()
    cur = con.cursor()
    try:
        # ── Phase: discovery ──────────────────────────────────────────────
        scan_state.update(phase="discovery", progress=0, total=0,
                          message="Discovering files...")
        all_files = []
        for root, dirs, files in os.walk(folder_path):
            # Prune hidden directories in place so os.walk never descends.
            dirs[:] = [d for d in dirs if not d.startswith(".")]
            for fname in files:
                # Takeout JSON sidecars are metadata, not media.
                if fname.endswith(".json"):
                    continue
                ext = Path(fname).suffix.lower()
                if ext in SUPPORTED_EXT:
                    all_files.append(os.path.join(root, fname))
        scan_state["total"] = len(all_files)
        scan_state["message"] = f"Found {len(all_files):,} files"
        if scan_state["cancel_requested"]:
            _mark_scan(cur, scan_id, "cancelled")
            con.commit()
            scan_state["status"] = "cancelled"
            return
        # ── Mode: full reset ──────────────────────────────────────────────
        if mode == "full_reset":
            cur.execute("DELETE FROM duplicate_members")
            cur.execute("DELETE FROM duplicate_groups")
            cur.execute("DELETE FROM files")
            con.commit()
        # ── Phase: takeout pre-processing ─────────────────────────────────
        scan_state.update(phase="takeout", message="Checking for Google Takeout structure...")
        if is_takeout_folder(folder_path):
            scan_state["message"] = "Processing Google Takeout sidecars..."
            process_takeout(folder_path, DB_PATH)
        if scan_state["cancel_requested"]:
            _mark_scan(cur, scan_id, "cancelled")
            con.commit()
            scan_state["status"] = "cancelled"
            return
        # ── Phase: indexing ───────────────────────────────────────────────
        scan_state.update(phase="indexing", progress=0,
                          message="Indexing files (SHA-256 + EXIF + dimensions)...")
        for i, path in enumerate(all_files):
            if scan_state["cancel_requested"]:
                _mark_scan(cur, scan_id, "cancelled")
                con.commit()
                scan_state["status"] = "cancelled"
                return
            scan_state["progress"] = i + 1
            scan_state["message"] = f"Indexing: {Path(path).name}"
            # Check existing record
            cur.execute("SELECT id, file_size, updated_at FROM files WHERE path = ?", (path,))
            existing = cur.fetchone()
            try:
                current_size = os.path.getsize(path)
            except OSError:
                # File vanished between discovery and indexing — skip it.
                continue
            if existing and mode in ("incremental", "new_files"):
                if mode == "new_files":
                    # Skip entirely — don't re-hash existing files
                    cur.execute("UPDATE files SET scan_id = ? WHERE path = ?", (scan_id, path))
                    continue
                # Incremental: skip if size unchanged (use size as proxy for change)
                if existing["file_size"] == current_size:
                    cur.execute("UPDATE files SET scan_id = ? WHERE path = ?", (scan_id, path))
                    continue
                # File changed — re-hash, clear group memberships
                cur.execute(
                    "DELETE FROM duplicate_members WHERE file_id = ?", (existing["id"],)
                )
            try:
                record = extract_file(path)
            except Exception as e:
                # Record the failure so the file stays visible but is
                # excluded from stats and grouping (status='error').
                cur.execute(
                    "INSERT OR IGNORE INTO files (path, filename, extension, scan_id, status) "
                    "VALUES (?, ?, ?, ?, 'error')",
                    (path, Path(path).name, Path(path).suffix.lower(), scan_id),
                )
                cur.execute(
                    "UPDATE files SET status='error', scan_id=?, updated_at=CURRENT_TIMESTAMP "
                    "WHERE path=?",
                    (scan_id, path),
                )
                con.commit()
                continue
            record["scan_id"] = scan_id
            if existing:
                # NOTE(review): phash is not reset by this UPDATE, so a
                # *changed* file keeps its stale perceptual hash (the
                # phash phase below only fills NULLs) — confirm intended.
                cur.execute("""
                    UPDATE files SET
                        filename=:filename, extension=:extension, file_size=:file_size,
                        mime_type=:mime_type, sha256=:sha256,
                        exif_datetime=:exif_datetime, exif_device=:exif_device,
                        width=:width, height=:height, scan_id=:scan_id,
                        status='pending', updated_at=CURRENT_TIMESTAMP
                    WHERE path=:path
                """, record)
            else:
                cur.execute("""
                    INSERT OR IGNORE INTO files
                    (path, filename, extension, file_size, mime_type, sha256,
                     exif_datetime, exif_device, width, height, scan_id, status)
                    VALUES
                    (:path, :filename, :extension, :file_size, :mime_type, :sha256,
                     :exif_datetime, :exif_device, :width, :height, :scan_id, 'pending')
                """, record)
            # Commit in batches to keep transactions bounded.
            if (i + 1) % 100 == 0:
                con.commit()
        con.commit()
        # ── Phase: phash ──────────────────────────────────────────────────
        scan_state.update(phase="phash", progress=0,
                          message="Computing perceptual hashes...")
        cur.execute("""
            SELECT id, path FROM files
            WHERE extension IN (
                '.jpg','.jpeg','.png','.gif','.bmp','.tiff','.tif',
                '.webp','.heic','.heif','.raw','.cr2','.nef','.arw',
                '.dng','.orf','.rw2','.pef','.srw','.x3f'
            ) AND phash IS NULL AND status != 'error'
        """)
        photo_rows = cur.fetchall()
        scan_state["total"] = len(photo_rows)
        for i, row in enumerate(photo_rows):
            if scan_state["cancel_requested"]:
                _mark_scan(cur, scan_id, "cancelled")
                con.commit()
                scan_state["status"] = "cancelled"
                return
            scan_state["progress"] = i + 1
            scan_state["message"] = f"Phash: {Path(row['path']).name}"
            ph = _phash(row["path"])
            if ph:
                cur.execute("UPDATE files SET phash=? WHERE id=?", (ph, row["id"]))
            if (i + 1) % 200 == 0:
                con.commit()
        con.commit()
        # ── Phase: grouping ───────────────────────────────────────────────
        scan_state.update(phase="grouping", progress=0, total=4,
                          message="Running duplicate detection...")
        if mode in ("incremental", "full_reset", "regroup"):
            # Rebuild every group from scratch.
            cur.execute("DELETE FROM duplicate_members")
            cur.execute("DELETE FROM duplicate_groups")
            con.commit()
        elif mode == "new_files":
            # Only clear groups containing new files
            cur.execute("""
                DELETE FROM duplicate_groups WHERE id IN (
                    SELECT DISTINCT dm.group_id FROM duplicate_members dm
                    JOIN files f ON f.id = dm.file_id
                    WHERE f.scan_id = ?
                )
            """, (scan_id,))
            con.commit()
        scan_state["message"] = "Pass 1/4: SHA-256 exact duplicates..."
        _run_sha256_pass(con, scan_id)
        scan_state["progress"] = 1
        con.commit()
        scan_state["message"] = "Pass 2/4: Perceptual hash similarity..."
        _run_phash_pass(con, scan_id)
        scan_state["progress"] = 2
        con.commit()
        scan_state["message"] = "Pass 3/4: EXIF timestamp + device..."
        _run_exif_pass(con, scan_id)
        scan_state["progress"] = 3
        con.commit()
        scan_state["message"] = "Pass 4/4: File size + dimensions..."
        _run_filesize_pass(con, scan_id)
        scan_state["progress"] = 4
        con.commit()
        # ── Restore keeper statuses for mode=incremental ──────────────────
        if mode == "incremental":
            # If a previously marked keeper no longer appears in any group, reset to pending
            cur.execute("""
                UPDATE files SET status='pending'
                WHERE status='keeper'
                AND id NOT IN (
                    SELECT file_id FROM duplicate_members WHERE is_keeper=1
                )
            """)
            con.commit()
        # Update scan record
        cur.execute(
            "UPDATE scans SET completed_at=CURRENT_TIMESTAMP, total_files=?, status='complete' "
            "WHERE id=?",
            (len(all_files), scan_id),
        )
        con.commit()
        scan_state.update(status="complete", phase="done",
                          message="Scan complete.", progress=scan_state["total"])
        _update_stats()
    except Exception as e:
        # Surface the failure via scan_state and the scans row; never let
        # the worker thread die silently.
        scan_state.update(status="error", message=str(e))
        try:
            _mark_scan(cur, scan_id, "error")
            con.commit()
        except Exception:
            pass
    finally:
        con.close()
def _mark_scan(cur, scan_id: int, status: str):
cur.execute(
"UPDATE scans SET completed_at=CURRENT_TIMESTAMP, status=? WHERE id=?",
(status, scan_id),
)
def _update_stats():
    """Refresh aggregate duplicate statistics into ``scan_state["stats"]``.

    Computes: total non-errored files, count/bytes of redundant files,
    per-method group and member counts, and reviewed/pending group tallies.

    Best-effort by design: any failure is swallowed so a stats refresh can
    never break a running scan; the previous stats simply remain in place.
    """
    try:
        con = get_db()
        try:
            cur = con.cursor()
            # Total ingested files, excluding ones that errored during scan.
            cur.execute("SELECT COUNT(*) FROM files WHERE status != 'error'")
            total_files = cur.fetchone()[0]
            # Count and reclaimable bytes of files marked redundant.
            cur.execute(
                "SELECT COUNT(*), SUM(file_size) FROM files WHERE status='redundant'"
            )
            r = cur.fetchone()
            dup_count = r[0] or 0
            dup_size = r[1] or 0  # SUM() is NULL when no rows match
            # Per-method group/file counts in a single grouped query.
            # (A previous per-method COUNT loop here was dead code — its
            # results were discarded — and has been removed.)
            cur.execute("""
                SELECT method,
                       COUNT(*) as groups,
                       (SELECT COUNT(*) FROM duplicate_members dm2
                        JOIN duplicate_groups dg2 ON dg2.id=dm2.group_id
                        WHERE dg2.method=dg.method) as files
                FROM duplicate_groups dg
                GROUP BY method
            """)
            by_method = {
                row["method"]: {"groups": row["groups"], "files": row["files"]}
                for row in cur.fetchall()
            }
            cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
            reviewed = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
            pending = cur.fetchone()[0]
            scan_state["stats"] = {
                "total_files": total_files,
                "duplicate_files": dup_count,
                "duplicate_size_bytes": dup_size,
                "groups_by_method": by_method,
                "reviewed": reviewed,
                "pending": pending,
            }
        finally:
            # Always release the connection (previously leaked on exception).
            con.close()
    except Exception:
        # Deliberate best-effort swallow: stats are advisory only.
        pass

149
app/takeout.py Normal file
View File

@@ -0,0 +1,149 @@
"""
Google Takeout pre-processor.
Detects Takeout folder structures, reads JSON sidecars, and enriches
the files table with corrected timestamps, normalized filenames, and
edit-version flags.
"""
import json
import os
import re
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
# Filename-stem suffixes Google Photos appends to edited variants of a
# photo (e.g. "IMG_0001-edited.jpg"); matched case-insensitively by
# _is_edited() below, which lowercases the stem before comparing.
EDIT_SUFFIXES = ("-edited", "-effects", "-smile", "-mix")
def _find_sidecar(media_path: str) -> str | None:
"""Return path to the JSON sidecar for a media file, or None."""
p = Path(media_path)
# Try filename.ext.json first, then filename.json
candidates = [
str(p) + ".json",
str(p.with_suffix(".json")),
]
for c in candidates:
if os.path.isfile(c):
return c
return None
def _strip_collision_suffix(filename: str) -> str:
"""Strip Google's (1), (2) collision suffixes from a filename."""
stem = Path(filename).stem
ext = Path(filename).suffix
cleaned = re.sub(r"\(\d+\)$", "", stem).rstrip()
return cleaned + ext
def _is_edited(filename: str) -> bool:
    """Return True when the stem carries one of Google's edit suffixes
    (case-insensitive), e.g. "IMG_0001-edited.jpg".
    """
    # str.endswith accepts a tuple, so one call covers every suffix.
    return Path(filename).stem.lower().endswith(EDIT_SUFFIXES)
def is_takeout_folder(folder_path: str) -> bool:
    """
    Detect a Google Takeout export by structure alone.

    Walks *folder_path* counting ".json" files that sit next to a media
    file of the same name (the Takeout sidecar pattern). Five or more
    such pairs is treated as conclusive. Hidden directories (dot-prefixed)
    are pruned and never descended into.
    """
    pairs_seen = 0
    for root, dirs, names in os.walk(folder_path):
        # Prune hidden directories in place so os.walk skips them.
        dirs[:] = [d for d in dirs if not d.startswith(".")]
        present = set(names)
        for sidecar in (n for n in names if n.endswith(".json")):
            # "IMG.jpg.json" pairs with "IMG.jpg" in the same directory.
            if sidecar[:-5] in present:
                pairs_seen += 1
        # Threshold checked once per directory, matching scan granularity.
        if pairs_seen >= 5:
            return True
    return False
def process_takeout(folder_path: str, db_path: str) -> int:
    """
    Walk *folder_path*, find media files with Google Takeout JSON
    sidecars, and enrich their rows in the ``files`` table of *db_path*.

    For each sidecar found, the matching row (by path) gets:
    ``is_takeout=1``, a normalized filename (sidecar title when present,
    "(N)" collision suffix stripped), the raw sidecar JSON, and — when
    the sidecar carries a parseable ``photoTakenTime`` — a corrected
    UTC ``exif_datetime``. Files whose name carries a Google edit suffix
    are additionally flagged via the optional ``is_edited`` column when
    that column exists.

    Returns the number of files whose DB row was actually updated.
    """
    con = sqlite3.connect(db_path)
    con.row_factory = sqlite3.Row
    enriched = 0
    try:
        cur = con.cursor()
        for root, dirs, files in os.walk(folder_path):
            dirs[:] = [d for d in dirs if not d.startswith(".")]
            for fname in files:
                if fname.endswith(".json"):
                    continue  # sidecars themselves are not media
                media_path = os.path.join(root, fname)
                sidecar = _find_sidecar(media_path)
                if not sidecar:
                    continue
                try:
                    with open(sidecar, "r", encoding="utf-8") as f:
                        data = json.load(f)
                except (json.JSONDecodeError, OSError):
                    continue  # unreadable/corrupt sidecar: skip this file
                # photoTakenTime is Google's capture time in epoch seconds
                # (UTC); prefer it because embedded EXIF is often missing.
                photo_taken_ts = None
                try:
                    ts = int(data["photoTakenTime"]["timestamp"])
                    dt = datetime.fromtimestamp(ts, tz=timezone.utc)
                    photo_taken_ts = dt.strftime("%Y-%m-%dT%H:%M:%S")
                except (KeyError, ValueError, TypeError):
                    pass
                title = data.get("title", "")
                # Prefer the sidecar title (the original upload name) over
                # the on-disk name, which Takeout may have renamed.
                if title:
                    normalized = _strip_collision_suffix(title)
                else:
                    normalized = _strip_collision_suffix(fname)
                updates = {
                    "is_takeout": 1,
                    "filename": normalized,
                    "takeout_json": json.dumps(data),
                }
                if photo_taken_ts:
                    updates["exif_datetime"] = photo_taken_ts
                set_clause = ", ".join(f"{k} = ?" for k in updates)
                cur.execute(
                    f"UPDATE files SET {set_clause}, updated_at = CURRENT_TIMESTAMP "
                    f"WHERE path = ?",
                    list(updates.values()) + [media_path],
                )
                # Fix: read rowcount of the MAIN update immediately, before
                # the optional is_edited statement (or its swallowed error)
                # can clobber cur.rowcount and miscount edited files.
                if cur.rowcount > 0:
                    enriched += 1
                if _is_edited(fname):
                    # is_edited is a later migration; tolerate its absence.
                    try:
                        cur.execute(
                            "UPDATE files SET is_edited = 1 WHERE path = ?",
                            (media_path,),
                        )
                    except sqlite3.OperationalError:
                        pass  # column doesn't exist yet, skip
        con.commit()
    finally:
        # Fix: connection previously leaked if any exception escaped.
        con.close()
    return enriched