Initial implementation of duplicate finder
Full project per spec: FastAPI backend, 4-method duplicate detection (SHA-256, phash, EXIF, filesize), Google Takeout pre-processor, 4 scan modes, and a dark-theme vanilla JS gallery frontend.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
app/main.py — new file (599 lines)
@@ -0,0 +1,599 @@
"""
FastAPI application — all API routes for the duplicate finder.
"""

import csv
import io
import os
import sqlite3
import subprocess
import threading

from fastapi import FastAPI, HTTPException, Query, Request
from fastapi.responses import FileResponse, Response, StreamingResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel

import scanner as sc

app = FastAPI(title="Duplicate Finder")
templates = Jinja2Templates(directory="/app/templates")

app.mount("/static", StaticFiles(directory="/app/static"), name="static")

METHOD_META = {
    "sha256": {"color": "#378ADD", "label": "Exact copy"},
    "phash": {"color": "#9b7de8", "label": "Visual match"},
    "exif": {"color": "#e2a43a", "label": "Same moment"},
    "filesize": {"color": "#888780", "label": "Possible match"},
}

# ── Startup ───────────────────────────────────────────────────────────────────

@app.on_event("startup")
def startup():
    sc.init_db()


# ── Frontend ──────────────────────────────────────────────────────────────────

@app.get("/")
def index(request: Request):
    return templates.TemplateResponse("index.html", {"request": request})


# ── DB helper ─────────────────────────────────────────────────────────────────

def get_db() -> sqlite3.Connection:
    return sc.get_db()


# ── Scan management ───────────────────────────────────────────────────────────

class ScanStartBody(BaseModel):
    folder_path: str
    mode: str = "incremental"


@app.post("/api/scan/start")
def scan_start(body: ScanStartBody):
    if sc.scan_state["status"] == "running":
        raise HTTPException(400, "A scan is already running")

    mode = body.mode
    if mode not in ("incremental", "full_reset", "new_files", "regroup"):
        raise HTTPException(400, f"Unknown scan mode: {mode}")

    if mode == "full_reset":
        con = get_db()
        cur = con.cursor()
        cur.execute("DELETE FROM duplicate_members")
        cur.execute("DELETE FROM duplicate_groups")
        cur.execute("DELETE FROM files")
        cur.execute("DELETE FROM scans")
        con.commit()
        con.close()

    con = get_db()
    cur = con.cursor()
    cur.execute(
        "INSERT INTO scans (folder_path, status) VALUES (?, 'running')",
        (body.folder_path,),
    )
    scan_id = cur.lastrowid
    con.commit()
    con.close()

    sc.scan_state.update(
        scan_id=scan_id,
        status="running",
        phase="discovery",
        progress=0,
        total=0,
        message="Starting...",
        cancel_requested=False,
        stats={},
    )

    thread = threading.Thread(
        target=sc.run_scan,
        args=(body.folder_path, scan_id, mode),
        daemon=True,
    )
    thread.start()

    return {"scan_id": scan_id}


@app.get("/api/scan/status")
def scan_status():
    state = sc.scan_state
    con = get_db()
    cur = con.cursor()

    # Build group counts per method
    stats = {}
    for method in ("sha256", "phash", "exif", "filesize"):
        cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE method=?", (method,))
        stats[f"{method}_groups"] = cur.fetchone()[0]

    cur.execute("SELECT COUNT(*) FROM duplicate_groups")
    stats["total_groups"] = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
    stats["reviewed"] = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
    stats["pending"] = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM files WHERE status != 'error'")
    stats["total_files"] = cur.fetchone()[0]
    con.close()

    return {
        "scan_id": state["scan_id"],
        "status": state["status"],
        "phase": state["phase"],
        "progress": state["progress"],
        "total": state["total"],
        "message": state["message"],
        "stats": stats,
    }
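

# Example of driving the scan API end-to-end — a sketch, not part of the app.
# It assumes the service is reachable at http://localhost:8000 and that
# /photos is a mounted library path; adjust both for your deployment.
# Stdlib only:
#
#   import json, time, urllib.request
#
#   req = urllib.request.Request(
#       "http://localhost:8000/api/scan/start",
#       data=json.dumps({"folder_path": "/photos", "mode": "incremental"}).encode(),
#       headers={"Content-Type": "application/json"},
#   )
#   print(json.load(urllib.request.urlopen(req)))  # -> {"scan_id": 1}
#
#   while True:
#       with urllib.request.urlopen("http://localhost:8000/api/scan/status") as resp:
#           status = json.load(resp)
#       print(status["phase"], status["progress"], "/", status["total"])
#       if status["status"] != "running":
#           break
#       time.sleep(2)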


@app.post("/api/scan/cancel")
def scan_cancel():
    if sc.scan_state["status"] != "running":
        raise HTTPException(400, "No scan is currently running")
    sc.scan_state["cancel_requested"] = True
    return {"success": True}


@app.delete("/api/scan/reset")
def scan_reset(confirm: str = Query("")):
    if confirm != "RESET":
        raise HTTPException(400, "Pass ?confirm=RESET to confirm")
    con = get_db()
    cur = con.cursor()
    cur.execute("DELETE FROM duplicate_members")
    cur.execute("DELETE FROM duplicate_groups")
    cur.execute("DELETE FROM files")
    cur.execute("DELETE FROM scans")
    con.commit()
    con.close()
    sc.scan_state.update(
        scan_id=None, status="idle", phase="idle",
        progress=0, total=0, message="", stats={},
    )
    return {"success": True}


# ── Duplicate groups ──────────────────────────────────────────────────────────

@app.get("/api/groups")
def list_groups(
    method: str = "all",
    reviewed: str = "false",
    sort: str = "count",
    offset: int = 0,
    limit: int = 50,
):
    con = get_db()
    cur = con.cursor()

    where = []
    params: list = []

    if method != "all":
        where.append("dg.method = ?")
        params.append(method)

    if reviewed == "false":
        where.append("dg.reviewed = 0")
    elif reviewed == "true":
        where.append("dg.reviewed = 1")

    where_clause = ("WHERE " + " AND ".join(where)) if where else ""

    order = {
        "count": "member_count DESC",
        "method": "dg.method, member_count DESC",
        "date": "dg.created_at DESC",
    }.get(sort, "member_count DESC")

    cur.execute(f"""
        SELECT COUNT(*) FROM duplicate_groups dg {where_clause}
    """, params)
    total = cur.fetchone()[0]

    cur.execute(f"""
        SELECT dg.id, dg.method, dg.reviewed,
               COUNT(dm.id) as member_count,
               (SELECT dm2.file_id FROM duplicate_members dm2
                WHERE dm2.group_id = dg.id AND dm2.suggested = 1
                LIMIT 1) as suggested_file_id
        FROM duplicate_groups dg
        LEFT JOIN duplicate_members dm ON dm.group_id = dg.id
        {where_clause}
        GROUP BY dg.id
        ORDER BY {order}
        LIMIT ? OFFSET ?
    """, params + [limit, offset])

    groups = []
    for row in cur.fetchall():
        meta = METHOD_META.get(row["method"], {"color": "#888", "label": row["method"]})
        suggested = None
        if row["suggested_file_id"]:
            cur.execute(
                "SELECT id, filename, width, height FROM files WHERE id=?",
                (row["suggested_file_id"],),
            )
            f = cur.fetchone()
            if f:
                suggested = {
                    "file_id": f["id"],
                    "filename": f["filename"],
                    "width": f["width"],
                    "height": f["height"],
                    "thumb_url": f"/api/thumb/{f['id']}",
                }
        groups.append({
            "id": row["id"],
            "method": row["method"],
            "method_color": meta["color"],
            "method_label": meta["label"],
            "member_count": row["member_count"],
            "reviewed": bool(row["reviewed"]),
            "suggested_keeper": suggested,
        })

    con.close()
    return {"total": total, "groups": groups}


@app.get("/api/groups/{group_id}")
def get_group(group_id: int):
    con = get_db()
    cur = con.cursor()

    cur.execute("SELECT * FROM duplicate_groups WHERE id=?", (group_id,))
    grp = cur.fetchone()
    if not grp:
        con.close()  # release the connection before bailing out
        raise HTTPException(404, "Group not found")

    meta = METHOD_META.get(grp["method"], {"color": "#888", "label": grp["method"]})

    cur.execute("""
        SELECT f.id, f.filename, f.path, f.file_size, f.width, f.height,
               f.mime_type, f.exif_datetime, f.exif_device,
               f.is_takeout, f.is_edited,
               dm.is_keeper, dm.suggested
        FROM duplicate_members dm
        JOIN files f ON f.id = dm.file_id
        WHERE dm.group_id = ?
        ORDER BY dm.suggested DESC, f.width * f.height DESC
    """, (group_id,))

    members = []
    for r in cur.fetchall():
        members.append({
            "file_id": r["id"],
            "filename": r["filename"],
            "path": r["path"],
            "file_size": r["file_size"],
            "width": r["width"],
            "height": r["height"],
            "mime_type": r["mime_type"],
            "exif_datetime": r["exif_datetime"],
            "exif_device": r["exif_device"],
            "is_takeout": bool(r["is_takeout"]),
            "is_edited": bool(r["is_edited"]) if r["is_edited"] is not None else False,
            "is_suggested": bool(r["suggested"]),
            "is_keeper": bool(r["is_keeper"]),
            "thumb_url": f"/api/thumb/{r['id']}",
        })

    con.close()
    return {
        "id": grp["id"],
        "method": grp["method"],
        "method_color": meta["color"],
        "method_label": meta["label"],
        "method_value": grp["method_value"],
        "reviewed": bool(grp["reviewed"]),
        "members": members,
    }


# ── Decisions ─────────────────────────────────────────────────────────────────

class DecideBody(BaseModel):
    keeper_file_id: int


@app.post("/api/groups/{group_id}/decide")
def decide(group_id: int, body: DecideBody):
    con = get_db()
    cur = con.cursor()

    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    if not cur.fetchone():
        con.close()
        raise HTTPException(404, "Group not found")

    cur.execute("SELECT file_id FROM duplicate_members WHERE group_id=?", (group_id,))
    all_members = [r["file_id"] for r in cur.fetchall()]

    if body.keeper_file_id not in all_members:
        con.close()
        raise HTTPException(400, "keeper_file_id is not a member of this group")

    for fid in all_members:
        is_k = 1 if fid == body.keeper_file_id else 0
        cur.execute(
            "UPDATE duplicate_members SET is_keeper=? WHERE group_id=? AND file_id=?",
            (is_k, group_id, fid),
        )
        status = "keeper" if is_k else "redundant"
        cur.execute("UPDATE files SET status=? WHERE id=?", (status, fid))

    cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}


@app.post("/api/groups/{group_id}/skip")
def skip_group(group_id: int):
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    if not cur.fetchone():
        con.close()
        raise HTTPException(404, "Group not found")
    cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}


@app.post("/api/groups/{group_id}/keep-all")
def keep_all(group_id: int):
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    if not cur.fetchone():
        con.close()
        raise HTTPException(404, "Group not found")
    cur.execute("SELECT file_id FROM duplicate_members WHERE group_id=?", (group_id,))
    for r in cur.fetchall():
        cur.execute(
            "UPDATE duplicate_members SET is_keeper=1 WHERE group_id=? AND file_id=?",
            (group_id, r["file_id"]),
        )
        cur.execute("UPDATE files SET status='keeper' WHERE id=?", (r["file_id"],))
    cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}


@app.post("/api/groups/{group_id}/unreview")
def unreview_group(group_id: int):
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
    if not cur.fetchone():
        con.close()
        raise HTTPException(404, "Group not found")
    cur.execute("SELECT file_id FROM duplicate_members WHERE group_id=?", (group_id,))
    for r in cur.fetchall():
        cur.execute(
            "UPDATE duplicate_members SET is_keeper=0 WHERE group_id=? AND file_id=?",
            (group_id, r["file_id"]),
        )
        cur.execute("UPDATE files SET status='pending' WHERE id=?", (r["file_id"],))
    cur.execute("UPDATE duplicate_groups SET reviewed=0 WHERE id=?", (group_id,))
    con.commit()
    con.close()
    return {"success": True}


@app.post("/api/groups/auto-resolve-exact")
def auto_resolve_exact():
    con = get_db()
    cur = con.cursor()
    cur.execute("""
        SELECT id FROM duplicate_groups
        WHERE method='sha256' AND reviewed=0
    """)
    groups = [r["id"] for r in cur.fetchall()]
    resolved = 0

    for gid in groups:
        cur.execute("""
            SELECT f.id, f.width, f.height, f.file_size, f.exif_datetime
            FROM duplicate_members dm
            JOIN files f ON f.id = dm.file_id
            WHERE dm.group_id = ?
        """, (gid,))
        members = [dict(r) for r in cur.fetchall()]
        if not members:
            continue

        keeper_id = sc._suggested_keeper_by_resolution(members)
        for m in members:
            is_k = 1 if m["id"] == keeper_id else 0
            cur.execute(
                "UPDATE duplicate_members SET is_keeper=? WHERE group_id=? AND file_id=?",
                (is_k, gid, m["id"]),
            )
            cur.execute(
                "UPDATE files SET status=? WHERE id=?",
                ("keeper" if is_k else "redundant", m["id"]),
            )
        cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (gid,))
        resolved += 1

    con.commit()
    con.close()
    return {"resolved": resolved}


# ── Files + thumbnails ────────────────────────────────────────────────────────

VIDEO_PLACEHOLDER_SVG = """<svg xmlns="http://www.w3.org/2000/svg" width="200" height="200"
    viewBox="0 0 200 200">
  <rect width="200" height="200" fill="#1e1e2e"/>
  <polygon points="75,55 75,145 145,100" fill="#9b7de8"/>
</svg>"""

VIDEO_EXT = {".mp4", ".mov", ".avi", ".mkv", ".m4v", ".3gp", ".wmv", ".mts", ".m2ts"}


@app.get("/api/thumb/{file_id}")
def get_thumb(file_id: int):
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT path, mime_type, extension FROM files WHERE id=?", (file_id,))
    row = cur.fetchone()
    con.close()

    if not row:
        raise HTTPException(404, "File not found")

    path = row["path"]
    ext = (row["extension"] or "").lower()

    if not os.path.isfile(path):
        raise HTTPException(404, "File not on disk")

    if ext in VIDEO_EXT:
        # Try ffmpeg for the first frame; fall back to a static placeholder
        try:
            result = subprocess.run(
                [
                    "ffmpeg", "-i", path,
                    "-vframes", "1", "-f", "image2", "-vcodec", "mjpeg",
                    "pipe:1",
                ],
                capture_output=True, timeout=10,
            )
            if result.returncode == 0 and result.stdout:
                return Response(content=result.stdout, media_type="image/jpeg")
        except Exception:
            pass
        return Response(content=VIDEO_PLACEHOLDER_SVG, media_type="image/svg+xml")

    # Serve the photo directly
    mime = row["mime_type"] or "application/octet-stream"
    return FileResponse(path, media_type=mime)


@app.get("/api/files/{file_id}")
def get_file_meta(file_id: int):
    con = get_db()
    cur = con.cursor()
    cur.execute("SELECT * FROM files WHERE id=?", (file_id,))
    row = cur.fetchone()
    con.close()
    if not row:
        raise HTTPException(404, "File not found")
    return dict(row)


# ── Stats ─────────────────────────────────────────────────────────────────────

@app.get("/api/stats")
def get_stats():
    con = get_db()
    cur = con.cursor()

    cur.execute("SELECT COUNT(*), SUM(file_size) FROM files WHERE status != 'error'")
    r = cur.fetchone()
    total_files = r[0] or 0
    total_size = r[1] or 0

    cur.execute("""
        SELECT COUNT(*), SUM(f.file_size)
        FROM files f
        JOIN duplicate_members dm ON dm.file_id = f.id
        WHERE dm.is_keeper = 0
    """)
    r = cur.fetchone()
    dup_files = r[0] or 0
    dup_size = r[1] or 0

    by_method = {}
    for method in ("sha256", "phash", "exif", "filesize"):
        cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE method=?", (method,))
        groups = cur.fetchone()[0]
        cur.execute("""
            SELECT COUNT(*) FROM duplicate_members dm
            JOIN duplicate_groups dg ON dg.id = dm.group_id
            WHERE dg.method = ?
        """, (method,))
        files = cur.fetchone()[0]
        by_method[method] = {"groups": groups, "files": files}

    cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
    reviewed = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
    pending = cur.fetchone()[0]

    cur.execute("SELECT COUNT(*) FROM files WHERE is_takeout=1")
    takeout_files = cur.fetchone()[0]

    con.close()
    return {
        "total_files": total_files,
        "total_size_bytes": total_size,
        "duplicate_files": dup_files,
        "duplicate_size_bytes": dup_size,
        "groups_by_method": by_method,
        "reviewed": reviewed,
        "pending": pending,
        "takeout_files": takeout_files,
    }


# ── Export ────────────────────────────────────────────────────────────────────

@app.get("/api/export/csv")
def export_csv():
    con = get_db()
    cur = con.cursor()
    cur.execute("""
        SELECT dg.id as group_id, dg.method, f.id as file_id,
               f.path, f.filename, f.file_size,
               f.width, f.height, f.exif_datetime, f.exif_device,
               dm.is_keeper,
               CASE WHEN dm.is_keeper=0 AND dg.reviewed=1 THEN 1 ELSE 0 END as is_redundant,
               dg.reviewed
        FROM duplicate_groups dg
        JOIN duplicate_members dm ON dm.group_id = dg.id
        JOIN files f ON f.id = dm.file_id
        ORDER BY dg.id, dm.is_keeper DESC
    """)
    rows = cur.fetchall()
    con.close()

    output = io.StringIO()
    writer = csv.writer(output)
    writer.writerow([
        "group_id", "method", "file_id", "path", "filename",
        "size", "width", "height", "exif_date", "device",
        "is_keeper", "is_redundant", "reviewed",
    ])
    for r in rows:
        writer.writerow([
            r["group_id"], r["method"], r["file_id"],
            r["path"], r["filename"], r["file_size"],
            r["width"], r["height"], r["exif_datetime"], r["exif_device"],
            r["is_keeper"], r["is_redundant"], r["reviewed"],
        ])

    output.seek(0)
    return StreamingResponse(
        iter([output.getvalue()]),
        media_type="text/csv",
        headers={"Content-Disposition": "attachment; filename=dup-finder-export.csv"},
    )
app/scanner.py — new file (758 lines)
@@ -0,0 +1,758 @@
"""
File scanner: discovery, per-file extraction, and all 4 duplicate detection passes.
"""

import hashlib
import mimetypes
import os
import sqlite3
import subprocess
from datetime import datetime
from pathlib import Path

import imagehash
from PIL import Image, ExifTags

try:
    from pillow_heif import register_heif_opener
    register_heif_opener()
except ImportError:
    pass

from takeout import is_takeout_folder, process_takeout


PHOTO_EXT = {
    ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif",
    ".webp", ".heic", ".heif", ".raw", ".cr2", ".nef", ".arw",
    ".dng", ".orf", ".rw2", ".pef", ".srw", ".x3f",
}

VIDEO_EXT = {
    ".mp4", ".mov", ".avi", ".mkv", ".m4v", ".3gp",
    ".wmv", ".mts", ".m2ts",
}

SUPPORTED_EXT = PHOTO_EXT | VIDEO_EXT

DB_PATH = "/data/dupfinder.db"

# Shared scan state (updated by background thread, read by status endpoint)
scan_state = {
    "scan_id": None,
    "status": "idle",   # idle | running | complete | error | cancelled
    "phase": "idle",    # discovery | takeout | indexing | phash | grouping | done
    "progress": 0,
    "total": 0,
    "message": "",
    "cancel_requested": False,
    "stats": {},
}


# ── DB helpers ────────────────────────────────────────────────────────────────

def get_db() -> sqlite3.Connection:
    con = sqlite3.connect(DB_PATH, timeout=30)
    con.row_factory = sqlite3.Row
    con.execute("PRAGMA journal_mode=WAL")
    con.execute("PRAGMA foreign_keys=ON")
    return con


def init_db():
    con = get_db()
    cur = con.cursor()
    cur.executescript("""
        CREATE TABLE IF NOT EXISTS files (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            path TEXT UNIQUE NOT NULL,
            filename TEXT NOT NULL,
            extension TEXT,
            file_size INTEGER,
            mime_type TEXT,
            sha256 TEXT,
            phash TEXT,
            exif_datetime TEXT,
            exif_device TEXT,
            width INTEGER,
            height INTEGER,
            is_takeout INTEGER DEFAULT 0,
            is_edited INTEGER DEFAULT 0,
            takeout_json TEXT,
            scan_id INTEGER,
            status TEXT DEFAULT 'pending',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );

        CREATE TABLE IF NOT EXISTS scans (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            folder_path TEXT NOT NULL,
            started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            completed_at TIMESTAMP,
            total_files INTEGER DEFAULT 0,
            status TEXT DEFAULT 'running'
        );

        CREATE TABLE IF NOT EXISTS duplicate_groups (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            method TEXT NOT NULL,
            method_value TEXT,
            reviewed INTEGER DEFAULT 0,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );

        CREATE TABLE IF NOT EXISTS duplicate_members (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            group_id INTEGER REFERENCES duplicate_groups(id) ON DELETE CASCADE,
            file_id INTEGER REFERENCES files(id) ON DELETE CASCADE,
            is_keeper INTEGER DEFAULT 0,
            suggested INTEGER DEFAULT 0
        );

        CREATE INDEX IF NOT EXISTS idx_sha256 ON files(sha256);
        CREATE INDEX IF NOT EXISTS idx_phash ON files(phash);
        CREATE INDEX IF NOT EXISTS idx_exif_dt ON files(exif_datetime, exif_device);
        CREATE INDEX IF NOT EXISTS idx_size_dim ON files(file_size, width, height);
        CREATE INDEX IF NOT EXISTS idx_status ON files(status);
    """)
    con.commit()
    con.close()


# ── Per-file extraction ───────────────────────────────────────────────────────

def _sha256(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(65536):
            h.update(chunk)
    return h.hexdigest()
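

# Sanity check (illustrative, not part of the pipeline; the path is invented):
# the streamed 64 KiB-chunk digest matches hashing the whole file at once,
# and matches coreutils' sha256sum output.
#
#   import hashlib
#   data = open("/tmp/x.bin", "rb").read()
#   assert _sha256("/tmp/x.bin") == hashlib.sha256(data).hexdigest()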


def _exif_data(path: str) -> tuple[str | None, str | None]:
    """Returns (exif_datetime, exif_device) or (None, None)."""
    try:
        with Image.open(path) as img:
            exif_raw = img._getexif()  # Pillow's legacy flat-dict EXIF helper
        if not exif_raw:
            return None, None
        exif = {ExifTags.TAGS.get(k, k): v for k, v in exif_raw.items()}
        dt = exif.get("DateTimeOriginal") or exif.get("DateTime")
        if dt:
            try:
                dt = datetime.strptime(dt, "%Y:%m:%d %H:%M:%S").strftime("%Y-%m-%dT%H:%M:%S")
            except ValueError:
                dt = None
        make = str(exif.get("Make", "")).strip()
        model = str(exif.get("Model", "")).strip()
        device = (make + " " + model).strip() if (make or model) else None
        return dt, device
    except Exception:
        return None, None


def _image_dims(path: str) -> tuple[int | None, int | None]:
    try:
        with Image.open(path) as img:
            return img.size  # (width, height)
    except Exception:
        return None, None


def _phash(path: str) -> str | None:
    try:
        with Image.open(path) as img:
            return str(imagehash.phash(img))
    except Exception:
        return None


def _video_dims(path: str) -> tuple[int | None, int | None]:
    try:
        result = subprocess.run(
            [
                "ffprobe", "-v", "error",
                "-select_streams", "v:0",
                "-show_entries", "stream=width,height",
                "-of", "csv=p=0",
                path,
            ],
            capture_output=True, text=True, timeout=10,
        )
        parts = result.stdout.strip().split(",")
        if len(parts) == 2:
            return int(parts[0]), int(parts[1])
    except Exception:
        pass
    return None, None


def _mtime_str(path: str) -> str | None:
    try:
        ts = os.path.getmtime(path)
        return datetime.fromtimestamp(ts).strftime("%Y-%m-%dT%H:%M:%S")
    except Exception:
        return None


def extract_file(path: str) -> dict:
    ext = Path(path).suffix.lower()
    filename = Path(path).name
    is_photo = ext in PHOTO_EXT
    is_video = ext in VIDEO_EXT

    record = {
        "path": path,
        "filename": filename,
        "extension": ext,
        "file_size": None,
        "mime_type": None,
        "sha256": None,
        "phash": None,
        "exif_datetime": None,
        "exif_device": None,
        "width": None,
        "height": None,
    }

    try:
        record["file_size"] = os.path.getsize(path)
    except OSError:
        pass

    record["mime_type"] = mimetypes.guess_type(path)[0]

    try:
        record["sha256"] = _sha256(path)
    except OSError:
        pass

    if is_photo:
        w, h = _image_dims(path)
        record["width"], record["height"] = w, h
        dt, device = _exif_data(path)
        record["exif_datetime"] = dt or _mtime_str(path)
        record["exif_device"] = device
        # phash is computed in a separate phase for progress reporting

    elif is_video:
        w, h = _video_dims(path)
        record["width"], record["height"] = w, h
        record["exif_datetime"] = _mtime_str(path)

    return record
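

# Illustrative output for a photo (all values invented; fields that can't be
# extracted stay None):
#
#   extract_file("/photos/IMG_0042.jpg") -> {
#       "path": "/photos/IMG_0042.jpg", "filename": "IMG_0042.jpg",
#       "extension": ".jpg", "file_size": 4194304, "mime_type": "image/jpeg",
#       "sha256": "9f86d08...", "phash": None,
#       "exif_datetime": "2021-06-01T14:33:08", "exif_device": "Apple iPhone 12",
#       "width": 4032, "height": 3024,
#   }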


# ── Union-Find for phash grouping ────────────────────────────────────────────

class UnionFind:
    def __init__(self):
        self.parent: dict[int, int] = {}

    def find(self, x: int) -> int:
        if x not in self.parent:
            self.parent[x] = x
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])  # path compression
        return self.parent[x]

    def union(self, x: int, y: int):
        px, py = self.find(x), self.find(y)
        if px != py:
            self.parent[px] = py

    def groups(self) -> dict[int, list[int]]:
        from collections import defaultdict
        result: dict[int, list[int]] = defaultdict(list)
        for x in self.parent:
            result[self.find(x)].append(x)
        return {k: v for k, v in result.items() if len(v) >= 2}
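

# Usage sketch (illustrative): transitive merging means near-matches chain
# together even when the endpoints were never compared directly.
#
#   uf = UnionFind()
#   uf.union(1, 2)
#   uf.union(2, 3)    # 1 and 3 now share a group without a direct comparison
#   uf.union(10, 11)
#   uf.groups()       # -> two groups: [1, 2, 3] and [10, 11]; singletons dropped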


# ── Detection passes ──────────────────────────────────────────────────────────

def _suggested_keeper_by_resolution(members: list[dict]) -> int:
    """Return file_id of the highest-resolution member; tie-break by size, then oldest date."""
    def key(m):
        w = m["width"] or 0
        h = m["height"] or 0
        size = m["file_size"] or 0
        dt = m["exif_datetime"] or "9999"  # missing dates sort after real ones
        # min() picks: largest area, then largest size, then oldest ISO date
        return (-(w * h), -size, dt)

    return min(members, key=key)["id"]


def _suggested_keeper_oldest(members: list[dict]) -> int:
    def key(m):
        return m["exif_datetime"] or "9999"
    return min(members, key=key)["id"]


def _run_sha256_pass(con: sqlite3.Connection, scan_id: int):
    cur = con.cursor()
    cur.execute("""
        SELECT sha256, COUNT(*) as cnt
        FROM files
        WHERE sha256 IS NOT NULL
        GROUP BY sha256
        HAVING cnt > 1
    """)
    rows = cur.fetchall()
    for row in rows:
        sha = row["sha256"]
        cur.execute("""
            SELECT id, width, height, file_size, exif_datetime
            FROM files WHERE sha256 = ?
        """, (sha,))
        members = [dict(r) for r in cur.fetchall()]

        keeper_id = _suggested_keeper_by_resolution(members)

        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('sha256', ?)",
            (sha,),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )


def _run_phash_pass(con: sqlite3.Connection, scan_id: int):
    cur = con.cursor()
    # Exclude files already claimed by sha256 groups
    cur.execute("""
        SELECT f.id, f.phash, f.width, f.height, f.file_size, f.exif_datetime
        FROM files f
        WHERE f.phash IS NOT NULL
          AND f.extension NOT IN (
              '.mp4','.mov','.avi','.mkv','.m4v','.3gp','.wmv','.mts','.m2ts'
          )
          AND f.id NOT IN (
              SELECT dm.file_id FROM duplicate_members dm
              JOIN duplicate_groups dg ON dg.id = dm.group_id
              WHERE dg.method = 'sha256'
          )
    """)
    rows = [dict(r) for r in cur.fetchall()]

    if len(rows) < 2:
        return

    # Bucket by the first 2 hex chars to cut the O(n²) comparison count.
    # This trades recall for speed: hashes that differ within the first byte
    # land in different buckets and are never compared.
    buckets: dict[str, list[dict]] = {}
    for r in rows:
        key = r["phash"][:2]
        buckets.setdefault(key, []).append(r)

    uf = UnionFind()
    # Ensure all IDs are registered
    for r in rows:
        uf.find(r["id"])

    THRESHOLD = 10
    for bucket in buckets.values():
        for i in range(len(bucket)):
            for j in range(i + 1, len(bucket)):
                a, b = bucket[i], bucket[j]
                try:
                    dist = imagehash.hex_to_hash(a["phash"]) - imagehash.hex_to_hash(b["phash"])
                    if dist <= THRESHOLD:
                        uf.union(a["id"], b["id"])
                except Exception:
                    pass

    id_map = {r["id"]: r for r in rows}
    for _, member_ids in uf.groups().items():
        members = [id_map[mid] for mid in member_ids if mid in id_map]
        if len(members) < 2:
            continue
        keeper_id = _suggested_keeper_by_resolution(members)
        keeper = id_map[keeper_id]
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('phash', ?)",
            (keeper["phash"],),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )
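

# What the threshold means (illustrative): imagehash.phash defaults to a
# 64-bit hash, and subtracting two hashes yields their Hamming distance in
# bits, so THRESHOLD = 10 accepts pairs differing in at most 10 of 64 bits.
#
#   h1 = imagehash.hex_to_hash("8f373714acfcf4d0")
#   h2 = imagehash.hex_to_hash("8f373714acfcf4d1")  # last nibble differs
#   h1 - h2  # -> 1 (one flipped bit)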


def _run_exif_pass(con: sqlite3.Connection, scan_id: int):
    cur = con.cursor()
    cur.execute("""
        SELECT exif_datetime, exif_device, COUNT(*) as cnt
        FROM files
        WHERE exif_datetime IS NOT NULL
          AND exif_device IS NOT NULL
          AND id NOT IN (
              SELECT dm.file_id FROM duplicate_members dm
              JOIN duplicate_groups dg ON dg.id = dm.group_id
              WHERE dg.method IN ('sha256', 'phash')
          )
        GROUP BY exif_datetime, exif_device
        HAVING cnt > 1
    """)
    rows = cur.fetchall()
    for row in rows:
        dt, dev = row["exif_datetime"], row["exif_device"]
        # Re-apply the exclusion here so files already claimed by an earlier
        # pass can't be pulled into an exif group a second time.
        cur.execute("""
            SELECT id, width, height, file_size, exif_datetime
            FROM files
            WHERE exif_datetime = ? AND exif_device = ?
              AND id NOT IN (
                  SELECT dm.file_id FROM duplicate_members dm
                  JOIN duplicate_groups dg ON dg.id = dm.group_id
                  WHERE dg.method IN ('sha256', 'phash')
              )
        """, (dt, dev))
        members = [dict(r) for r in cur.fetchall()]
        if len(members) < 2:
            continue
        keeper_id = _suggested_keeper_by_resolution(members)
        method_value = f"{dt}::{dev}"
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('exif', ?)",
            (method_value,),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )


def _run_filesize_pass(con: sqlite3.Connection, scan_id: int):
    cur = con.cursor()
    cur.execute("""
        SELECT file_size, width, height, COUNT(*) as cnt
        FROM files
        WHERE file_size IS NOT NULL
          AND width IS NOT NULL
          AND height IS NOT NULL
          AND id NOT IN (
              SELECT dm.file_id FROM duplicate_members dm
              JOIN duplicate_groups dg ON dg.id = dm.group_id
              WHERE dg.method IN ('sha256', 'phash', 'exif')
          )
        GROUP BY file_size, width, height
        HAVING cnt > 1
    """)
    rows = cur.fetchall()
    for row in rows:
        fs, w, h = row["file_size"], row["width"], row["height"]
        # Same re-applied exclusion as the exif pass, for the same reason.
        cur.execute("""
            SELECT id, width, height, file_size, exif_datetime
            FROM files
            WHERE file_size = ? AND width = ? AND height = ?
              AND id NOT IN (
                  SELECT dm.file_id FROM duplicate_members dm
                  JOIN duplicate_groups dg ON dg.id = dm.group_id
                  WHERE dg.method IN ('sha256', 'phash', 'exif')
              )
        """, (fs, w, h))
        members = [dict(r) for r in cur.fetchall()]
        if len(members) < 2:
            continue
        keeper_id = _suggested_keeper_oldest(members)
        method_value = f"{fs}::{w}x{h}"
        cur.execute(
            "INSERT INTO duplicate_groups (method, method_value) VALUES ('filesize', ?)",
            (method_value,),
        )
        group_id = cur.lastrowid
        for m in members:
            cur.execute(
                "INSERT INTO duplicate_members (group_id, file_id, suggested) VALUES (?, ?, ?)",
                (group_id, m["id"], 1 if m["id"] == keeper_id else 0),
            )
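

# The four passes run in decreasing order of confidence, and each pass skips
# files already claimed by an earlier one, so every file lands in at most one
# group. For example, two byte-identical copies are claimed by sha256 and
# never reach phash; a re-encoded export of the same shot misses sha256 but
# can still be caught by phash, then exif, then filesize.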


# ── Main scan entry point ─────────────────────────────────────────────────────

def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
    """Main scan function — runs in a background thread."""
    con = get_db()
    cur = con.cursor()

    try:
        # ── Phase: discovery ──────────────────────────────────────────────
        scan_state.update(phase="discovery", progress=0, total=0,
                          message="Discovering files...")

        all_files = []
        for root, dirs, files in os.walk(folder_path):
            dirs[:] = [d for d in dirs if not d.startswith(".")]
            for fname in files:
                if fname.endswith(".json"):
                    continue
                ext = Path(fname).suffix.lower()
                if ext in SUPPORTED_EXT:
                    all_files.append(os.path.join(root, fname))

        scan_state["total"] = len(all_files)
        scan_state["message"] = f"Found {len(all_files):,} files"

        if scan_state["cancel_requested"]:
            _mark_scan(cur, scan_id, "cancelled")
            con.commit()
            scan_state["status"] = "cancelled"
            return

        # ── Mode: full reset ──────────────────────────────────────────────
        if mode == "full_reset":
            cur.execute("DELETE FROM duplicate_members")
            cur.execute("DELETE FROM duplicate_groups")
            cur.execute("DELETE FROM files")
            con.commit()

        # ── Phase: takeout pre-processing ─────────────────────────────────
        scan_state.update(phase="takeout", message="Checking for Google Takeout structure...")
        if is_takeout_folder(folder_path):
            scan_state["message"] = "Processing Google Takeout sidecars..."
            process_takeout(folder_path, DB_PATH)

        if scan_state["cancel_requested"]:
            _mark_scan(cur, scan_id, "cancelled")
            con.commit()
            scan_state["status"] = "cancelled"
            return

        # ── Phase: indexing ───────────────────────────────────────────────
        scan_state.update(phase="indexing", progress=0,
                          message="Indexing files (SHA-256 + EXIF + dimensions)...")

        for i, path in enumerate(all_files):
            if scan_state["cancel_requested"]:
                _mark_scan(cur, scan_id, "cancelled")
                con.commit()
                scan_state["status"] = "cancelled"
                return

            scan_state["progress"] = i + 1
            scan_state["message"] = f"Indexing: {Path(path).name}"

            # Check existing record
            cur.execute("SELECT id, file_size, updated_at FROM files WHERE path = ?", (path,))
            existing = cur.fetchone()

            try:
                current_size = os.path.getsize(path)
            except OSError:
                continue

            if existing and mode in ("incremental", "new_files"):
                if mode == "new_files":
                    # Skip entirely — don't re-hash existing files
                    cur.execute("UPDATE files SET scan_id = ? WHERE path = ?", (scan_id, path))
                    continue
                # Incremental: skip if size unchanged (size is the change proxy)
                if existing["file_size"] == current_size:
                    cur.execute("UPDATE files SET scan_id = ? WHERE path = ?", (scan_id, path))
                    continue
                # File changed — re-hash, clear group memberships
                cur.execute(
                    "DELETE FROM duplicate_members WHERE file_id = ?", (existing["id"],)
                )

            try:
                record = extract_file(path)
            except Exception:
                cur.execute(
                    "INSERT OR IGNORE INTO files (path, filename, extension, scan_id, status) "
                    "VALUES (?, ?, ?, ?, 'error')",
                    (path, Path(path).name, Path(path).suffix.lower(), scan_id),
                )
                cur.execute(
                    "UPDATE files SET status='error', scan_id=?, updated_at=CURRENT_TIMESTAMP "
                    "WHERE path=?",
                    (scan_id, path),
                )
                con.commit()
                continue

            record["scan_id"] = scan_id
            if existing:
                cur.execute("""
                    UPDATE files SET
                        filename=:filename, extension=:extension, file_size=:file_size,
                        mime_type=:mime_type, sha256=:sha256,
                        exif_datetime=:exif_datetime, exif_device=:exif_device,
                        width=:width, height=:height, scan_id=:scan_id,
                        status='pending', updated_at=CURRENT_TIMESTAMP
                    WHERE path=:path
                """, record)
            else:
                cur.execute("""
                    INSERT OR IGNORE INTO files
                        (path, filename, extension, file_size, mime_type, sha256,
                         exif_datetime, exif_device, width, height, scan_id, status)
                    VALUES
                        (:path, :filename, :extension, :file_size, :mime_type, :sha256,
                         :exif_datetime, :exif_device, :width, :height, :scan_id, 'pending')
                """, record)

            if (i + 1) % 100 == 0:
                con.commit()

        con.commit()

        # ── Phase: phash ──────────────────────────────────────────────────
        scan_state.update(phase="phash", progress=0,
                          message="Computing perceptual hashes...")

        cur.execute("""
            SELECT id, path FROM files
            WHERE extension IN (
                '.jpg','.jpeg','.png','.gif','.bmp','.tiff','.tif',
                '.webp','.heic','.heif','.raw','.cr2','.nef','.arw',
                '.dng','.orf','.rw2','.pef','.srw','.x3f'
            ) AND phash IS NULL AND status != 'error'
        """)
        photo_rows = cur.fetchall()
        scan_state["total"] = len(photo_rows)

        for i, row in enumerate(photo_rows):
            if scan_state["cancel_requested"]:
                _mark_scan(cur, scan_id, "cancelled")
                con.commit()
                scan_state["status"] = "cancelled"
                return

            scan_state["progress"] = i + 1
            scan_state["message"] = f"Phash: {Path(row['path']).name}"
            ph = _phash(row["path"])
            if ph:
                cur.execute("UPDATE files SET phash=? WHERE id=?", (ph, row["id"]))
            if (i + 1) % 200 == 0:
                con.commit()

        con.commit()

        # ── Phase: grouping ───────────────────────────────────────────────
        scan_state.update(phase="grouping", progress=0, total=4,
                          message="Running duplicate detection...")

        if mode in ("incremental", "full_reset", "regroup"):
            cur.execute("DELETE FROM duplicate_members")
            cur.execute("DELETE FROM duplicate_groups")
            con.commit()
        elif mode == "new_files":
            # Only clear groups containing new files
            cur.execute("""
                DELETE FROM duplicate_groups WHERE id IN (
                    SELECT DISTINCT dm.group_id FROM duplicate_members dm
                    JOIN files f ON f.id = dm.file_id
                    WHERE f.scan_id = ?
                )
            """, (scan_id,))
            con.commit()

        scan_state["message"] = "Pass 1/4: SHA-256 exact duplicates..."
        _run_sha256_pass(con, scan_id)
        scan_state["progress"] = 1
        con.commit()

        scan_state["message"] = "Pass 2/4: Perceptual hash similarity..."
        _run_phash_pass(con, scan_id)
        scan_state["progress"] = 2
        con.commit()

        scan_state["message"] = "Pass 3/4: EXIF timestamp + device..."
        _run_exif_pass(con, scan_id)
        scan_state["progress"] = 3
        con.commit()

        scan_state["message"] = "Pass 4/4: File size + dimensions..."
        _run_filesize_pass(con, scan_id)
        scan_state["progress"] = 4
        con.commit()

        # ── Restore keeper statuses for mode=incremental ──────────────────
        if mode == "incremental":
            # If a previously marked keeper no longer appears in any group, reset to pending
            cur.execute("""
                UPDATE files SET status='pending'
                WHERE status='keeper'
                  AND id NOT IN (
                      SELECT file_id FROM duplicate_members WHERE is_keeper=1
                  )
            """)
            con.commit()

        # Update scan record
        cur.execute(
            "UPDATE scans SET completed_at=CURRENT_TIMESTAMP, total_files=?, status='complete' "
            "WHERE id=?",
            (len(all_files), scan_id),
        )
        con.commit()

        scan_state.update(status="complete", phase="done",
                          message="Scan complete.", progress=scan_state["total"])
        _update_stats()

    except Exception as e:
        scan_state.update(status="error", message=str(e))
        try:
            _mark_scan(cur, scan_id, "error")
            con.commit()
        except Exception:
            pass
    finally:
        con.close()


def _mark_scan(cur, scan_id: int, status: str):
    cur.execute(
        "UPDATE scans SET completed_at=CURRENT_TIMESTAMP, status=? WHERE id=?",
        (status, scan_id),
    )


def _update_stats():
    """Refresh stats in scan_state."""
    try:
        con = get_db()
        cur = con.cursor()
        cur.execute("SELECT COUNT(*) FROM files WHERE status != 'error'")
        total_files = cur.fetchone()[0]

        cur.execute("SELECT COUNT(*), SUM(file_size) FROM files WHERE status='redundant'")
        r = cur.fetchone()
        dup_count = r[0] or 0
        dup_size = r[1] or 0

        cur.execute("""
            SELECT method,
                   COUNT(*) as groups,
                   (SELECT COUNT(*) FROM duplicate_members dm2
                    JOIN duplicate_groups dg2 ON dg2.id=dm2.group_id
                    WHERE dg2.method=dg.method) as files
            FROM duplicate_groups dg
            GROUP BY method
        """)
        by_method = {r["method"]: {"groups": r["groups"], "files": r["files"]}
                     for r in cur.fetchall()}

        cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
        reviewed = cur.fetchone()[0]
        cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
        pending = cur.fetchone()[0]

        scan_state["stats"] = {
            "total_files": total_files,
            "duplicate_files": dup_count,
            "duplicate_size_bytes": dup_size,
            "groups_by_method": by_method,
            "reviewed": reviewed,
            "pending": pending,
        }
        con.close()
    except Exception:
        pass


app/takeout.py — new file (149 lines)
@@ -0,0 +1,149 @@
"""
Google Takeout pre-processor.

Detects Takeout folder structures, reads JSON sidecars, and enriches
the files table with corrected timestamps, normalized filenames, and
edit-version flags.
"""

import json
import os
import re
import sqlite3
from datetime import datetime, timezone
from pathlib import Path


# Google edit suffixes appended to filenames
EDIT_SUFFIXES = ("-edited", "-effects", "-smile", "-mix")


def _find_sidecar(media_path: str) -> str | None:
    """Return the path to the JSON sidecar for a media file, or None."""
    p = Path(media_path)
    # Try filename.ext.json first, then filename.json
    candidates = [
        str(p) + ".json",
        str(p.with_suffix(".json")),
    ]
    for c in candidates:
        if os.path.isfile(c):
            return c
    return None
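

# Naming convention this resolves (paths are illustrative):
#
#   _find_sidecar("/takeout/IMG_0042.jpg")
#   # checks /takeout/IMG_0042.jpg.json first, then /takeout/IMG_0042.json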


def _strip_collision_suffix(filename: str) -> str:
    """Strip Google's (1), (2) collision suffixes from a filename."""
    stem = Path(filename).stem
    ext = Path(filename).suffix
    cleaned = re.sub(r"\(\d+\)$", "", stem).rstrip()
    return cleaned + ext
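

# Behaviour by example:
#
#   _strip_collision_suffix("IMG_0042(1).jpg")  # -> "IMG_0042.jpg"
#   _strip_collision_suffix("party (2).png")    # -> "party.png"
#   _strip_collision_suffix("IMG_0042.jpg")     # -> "IMG_0042.jpg" (unchanged)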


def _is_edited(filename: str) -> bool:
    stem = Path(filename).stem.lower()
    return any(stem.endswith(s) for s in EDIT_SUFFIXES)


def is_takeout_folder(folder_path: str) -> bool:
    """
    Heuristic: walk the folder looking for .json files whose names match
    adjacent media files. If we find at least 5 such pairs, call it Takeout.
    """
    count = 0
    for root, dirs, files in os.walk(folder_path):
        # Skip hidden dirs
        dirs[:] = [d for d in dirs if not d.startswith(".")]
        file_set = set(files)
        for f in files:
            if not f.endswith(".json"):
                continue
            # Check if a media file exists that this could be a sidecar for
            base = f[:-5]  # strip ".json"
            if base in file_set:
                count += 1
                if count >= 5:
                    return True
    return False


def process_takeout(folder_path: str, db_path: str) -> int:
    """
    Walk folder_path, find all media files with JSON sidecars,
    and enrich their DB records. Returns the count of files enriched.
    """
    con = sqlite3.connect(db_path)
    con.row_factory = sqlite3.Row
    cur = con.cursor()

    enriched = 0

    for root, dirs, files in os.walk(folder_path):
        dirs[:] = [d for d in dirs if not d.startswith(".")]
        for fname in files:
            if fname.endswith(".json"):
                continue
            media_path = os.path.join(root, fname)
            sidecar = _find_sidecar(media_path)
            if not sidecar:
                continue

            try:
                with open(sidecar, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (json.JSONDecodeError, OSError):
                continue

            # Extract fields from the sidecar
            photo_taken_ts = None
            try:
                ts = int(data["photoTakenTime"]["timestamp"])
                dt = datetime.fromtimestamp(ts, tz=timezone.utc)
                photo_taken_ts = dt.strftime("%Y-%m-%dT%H:%M:%S")
            except (KeyError, ValueError, TypeError):
                pass

            title = data.get("title", "")
            takeout_json_str = json.dumps(data)

            # Normalized filename: use title if present, else strip suffix from fname
            if title:
                normalized = _strip_collision_suffix(title)
            else:
                normalized = _strip_collision_suffix(fname)

            edited = _is_edited(fname)

            # Update the DB record for this file
            updates = {
                "is_takeout": 1,
                "filename": normalized,
                "takeout_json": takeout_json_str,
            }
            if photo_taken_ts:
                updates["exif_datetime"] = photo_taken_ts

            set_clause = ", ".join(f"{k} = ?" for k in updates)
            values = list(updates.values()) + [media_path]

            cur.execute(
                f"UPDATE files SET {set_clause}, updated_at = CURRENT_TIMESTAMP "
                f"WHERE path = ?",
                values,
            )
            # Capture rowcount now — the is_edited update below would reset it
            updated = cur.rowcount

            # Flag edited versions — tolerate older DBs where the is_edited
            # column doesn't exist yet
            if edited:
                try:
                    cur.execute(
                        "UPDATE files SET is_edited = 1 WHERE path = ?",
                        (media_path,),
                    )
                except sqlite3.OperationalError:
                    pass  # column doesn't exist yet, skip

            if updated > 0:
                enriched += 1

    con.commit()
    con.close()
    return enriched
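

# Shape of a typical Takeout sidecar this parser consumes (abridged and
# illustrative — real files carry many more keys):
#
#   IMG_0042.jpg.json:
#   {
#     "title": "IMG_0042.jpg",
#     "photoTakenTime": {"timestamp": "1496190238", "formatted": "..."},
#     "geoData": {"latitude": 0.0, "longitude": 0.0}
#   }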