# NOTE: CSV export hardening — QUOTE_ALL + sanitised NUL/CR/LF in
# path/filename/exif fields; the default csv dialect rejected fields
# containing line terminators with "need to escape, but no escapechar set".
"""
|
|
FastAPI application — all API routes for the duplicate finder.
|
|
"""
|
|
|
|
import csv
|
|
import io
|
|
import os
|
|
import sqlite3
|
|
import subprocess
|
|
import threading
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
from fastapi import FastAPI, HTTPException, Query, Request
|
|
from fastapi.responses import (
|
|
FileResponse, JSONResponse, Response, StreamingResponse
|
|
)
|
|
from fastapi.staticfiles import StaticFiles
|
|
from fastapi.templating import Jinja2Templates
|
|
from pydantic import BaseModel
|
|
|
|
import scanner as sc
|
|
|
|
# Application object — every route below hangs off this instance.
app = FastAPI(title="Duplicate Finder")

# Resolve paths relative to this file so it works both in Docker and locally
_BASE = Path(__file__).parent
# Prefer a templates dir next to this file, then one level up, then the
# fixed Docker image path as a last resort.
_TEMPLATES_DIR = (
    str(_BASE / "templates") if (_BASE / "templates").exists()
    else str(_BASE.parent / "templates") if (_BASE.parent / "templates").exists()
    else "/app/templates"
)
_STATIC_DIR = str(_BASE / "static")
_STATIC_DIR = _STATIC_DIR if Path(_STATIC_DIR).exists() else "/app/static"
# Ensure static dir exists
Path(_STATIC_DIR).mkdir(parents=True, exist_ok=True)

templates = Jinja2Templates(directory=_TEMPLATES_DIR)

app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")

# Display metadata (badge colour + human-readable label) for each
# duplicate-detection method; consumed by the group endpoints below.
METHOD_META = {
    "sha256": {"color": "#378ADD", "label": "Exact copy"},
    "phash": {"color": "#9b7de8", "label": "Visual match"},
    "exif": {"color": "#e2a43a", "label": "Same moment"},
    "filesize": {"color": "#888780", "label": "Possible match"},
}
|
|
|
|
# ── Startup ───────────────────────────────────────────────────────────────────
|
|
|
|
@app.on_event("startup")
def startup():
    """Initialise the SQLite schema (idempotent) before serving requests."""
    sc.init_db()
|
|
|
|
|
|
# ── Frontend ──────────────────────────────────────────────────────────────────
|
|
|
|
@app.get("/")
|
|
def index(request: Request):
|
|
return templates.TemplateResponse("index.html", {"request": request})
|
|
|
|
|
|
# ── DB helper ─────────────────────────────────────────────────────────────────
|
|
|
|
def get_db() -> sqlite3.Connection:
    """Open a new SQLite connection via the scanner module's factory."""
    return sc.get_db()
|
|
|
|
|
|
# ── Scan management ───────────────────────────────────────────────────────────
|
|
|
|
class ScanStartBody(BaseModel):
    """Request body for POST /api/scan/start."""
    # Absolute path of the folder to scan.
    folder_path: str
    # One of: "incremental", "full_reset", "new_files", "regroup".
    mode: str = "incremental"
|
|
|
|
|
|
@app.post("/api/scan/start")
|
|
def scan_start(body: ScanStartBody):
|
|
if sc.scan_state["status"] == "running":
|
|
raise HTTPException(400, "A scan is already running")
|
|
|
|
mode = body.mode
|
|
if mode not in ("incremental", "full_reset", "new_files", "regroup"):
|
|
raise HTTPException(400, f"Unknown scan mode: {mode}")
|
|
|
|
if mode == "full_reset":
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
cur.execute("DELETE FROM duplicate_members")
|
|
cur.execute("DELETE FROM duplicate_groups")
|
|
cur.execute("DELETE FROM files")
|
|
cur.execute("DELETE FROM scans")
|
|
con.commit()
|
|
con.close()
|
|
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
cur.execute(
|
|
"INSERT INTO scans (folder_path, status) VALUES (?, 'running')",
|
|
(body.folder_path,),
|
|
)
|
|
scan_id = cur.lastrowid
|
|
con.commit()
|
|
con.close()
|
|
|
|
sc.scan_state.update(
|
|
scan_id=scan_id,
|
|
status="running",
|
|
phase="takeout",
|
|
progress=0,
|
|
total=0,
|
|
message="Starting...",
|
|
pause_requested=False,
|
|
files_indexed=0,
|
|
phashes_done=0,
|
|
folder_path=body.folder_path,
|
|
stats={},
|
|
)
|
|
|
|
thread = threading.Thread(
|
|
target=sc.run_scan,
|
|
args=(body.folder_path, scan_id, mode),
|
|
daemon=True,
|
|
)
|
|
thread.start()
|
|
|
|
return {"scan_id": scan_id}
|
|
|
|
|
|
@app.get("/api/scan/status")
|
|
def scan_status():
|
|
state = sc.scan_state
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
|
|
# Build group counts per method
|
|
stats = {}
|
|
for method in ("sha256", "phash", "exif", "filesize"):
|
|
cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE method=?", (method,))
|
|
stats[f"{method}_groups"] = cur.fetchone()[0]
|
|
|
|
cur.execute("SELECT COUNT(*) FROM duplicate_groups")
|
|
stats["total_groups"] = cur.fetchone()[0]
|
|
cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
|
|
stats["reviewed"] = cur.fetchone()[0]
|
|
cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
|
|
stats["pending"] = cur.fetchone()[0]
|
|
cur.execute("SELECT COUNT(*) FROM files WHERE status != 'error'")
|
|
stats["total_files"] = cur.fetchone()[0]
|
|
con.close()
|
|
|
|
return {
|
|
"scan_id": state["scan_id"],
|
|
"status": state["status"],
|
|
"phase": state["phase"],
|
|
"progress": state["progress"],
|
|
"total": state["total"],
|
|
"message": state["message"],
|
|
"folder_path": state.get("folder_path"),
|
|
"files_indexed": state.get("files_indexed", 0),
|
|
"phashes_done": state.get("phashes_done", 0),
|
|
"stats": stats,
|
|
}
|
|
|
|
|
|
@app.post("/api/scan/pause")
|
|
def scan_pause():
|
|
if sc.scan_state["status"] != "running":
|
|
raise HTTPException(400, "No scan is currently running")
|
|
sc.scan_state["pause_requested"] = True
|
|
return {"success": True}
|
|
|
|
|
|
# Keep /cancel as an alias so any lingering clients still work
@app.post("/api/scan/cancel")
def scan_cancel():
    """Deprecated alias for POST /api/scan/pause."""
    return scan_pause()
|
|
|
|
|
|
@app.post("/api/scan/resume")
|
|
def scan_resume():
|
|
if sc.scan_state["status"] != "paused":
|
|
raise HTTPException(400, "No paused scan to resume")
|
|
|
|
folder_path = sc.scan_state.get("folder_path")
|
|
if not folder_path:
|
|
raise HTTPException(400, "No folder path saved — please start a new scan")
|
|
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
cur.execute(
|
|
"INSERT INTO scans (folder_path, status) VALUES (?, 'running')",
|
|
(folder_path,),
|
|
)
|
|
scan_id = cur.lastrowid
|
|
con.commit()
|
|
con.close()
|
|
|
|
sc.scan_state.update(
|
|
scan_id=scan_id,
|
|
status="running",
|
|
phase="takeout",
|
|
progress=0,
|
|
total=0,
|
|
message="Resuming scan...",
|
|
pause_requested=False,
|
|
files_indexed=0,
|
|
phashes_done=0,
|
|
folder_path=folder_path,
|
|
stats={},
|
|
)
|
|
|
|
thread = threading.Thread(
|
|
target=sc.run_scan,
|
|
args=(folder_path, scan_id, "incremental"),
|
|
daemon=True,
|
|
)
|
|
thread.start()
|
|
|
|
return {"scan_id": scan_id}
|
|
|
|
|
|
@app.delete("/api/scan/reset")
|
|
def scan_reset(confirm: str = Query("")):
|
|
if confirm != "RESET":
|
|
raise HTTPException(400, "Pass ?confirm=RESET to confirm")
|
|
if sc.scan_state["status"] == "running":
|
|
raise HTTPException(
|
|
400, "A scan is currently running — pause it before resetting"
|
|
)
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
cur.execute("DELETE FROM duplicate_members")
|
|
cur.execute("DELETE FROM duplicate_groups")
|
|
cur.execute("DELETE FROM files")
|
|
cur.execute("DELETE FROM scans")
|
|
con.commit()
|
|
con.close()
|
|
sc.scan_state.update(
|
|
scan_id=None, status="idle", phase="idle",
|
|
progress=0, total=0, message="",
|
|
pause_requested=False, files_indexed=0,
|
|
phashes_done=0, folder_path=None, stats={},
|
|
)
|
|
return {"success": True}
|
|
|
|
|
|
# ── Duplicate groups ──────────────────────────────────────────────────────────
|
|
|
|
@app.get("/api/groups")
|
|
def list_groups(
|
|
method: str = "all",
|
|
reviewed: str = "false",
|
|
sort: str = "count",
|
|
offset: int = 0,
|
|
limit: int = 50,
|
|
):
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
|
|
where = []
|
|
params: list = []
|
|
|
|
if method != "all":
|
|
where.append("dg.method = ?")
|
|
params.append(method)
|
|
|
|
if reviewed == "false":
|
|
where.append("dg.reviewed = 0")
|
|
elif reviewed == "true":
|
|
where.append("dg.reviewed = 1")
|
|
|
|
where_clause = ("WHERE " + " AND ".join(where)) if where else ""
|
|
|
|
order = {
|
|
"count": "member_count DESC",
|
|
"method": "dg.method, member_count DESC",
|
|
"date": "dg.created_at DESC",
|
|
}.get(sort, "member_count DESC")
|
|
|
|
cur.execute(f"""
|
|
SELECT COUNT(*) FROM duplicate_groups dg {where_clause}
|
|
""", params)
|
|
total = cur.fetchone()[0]
|
|
|
|
cur.execute(f"""
|
|
SELECT dg.id, dg.method, dg.reviewed,
|
|
COUNT(dm.id) as member_count,
|
|
(SELECT dm2.file_id FROM duplicate_members dm2
|
|
WHERE dm2.group_id = dg.id AND dm2.suggested = 1
|
|
LIMIT 1) as suggested_file_id
|
|
FROM duplicate_groups dg
|
|
LEFT JOIN duplicate_members dm ON dm.group_id = dg.id
|
|
{where_clause}
|
|
GROUP BY dg.id
|
|
ORDER BY {order}
|
|
LIMIT ? OFFSET ?
|
|
""", params + [limit, offset])
|
|
|
|
groups = []
|
|
for row in cur.fetchall():
|
|
meta = METHOD_META.get(row["method"], {"color": "#888", "label": row["method"]})
|
|
suggested = None
|
|
if row["suggested_file_id"]:
|
|
cur.execute(
|
|
"SELECT id, filename, width, height FROM files WHERE id=?",
|
|
(row["suggested_file_id"],),
|
|
)
|
|
f = cur.fetchone()
|
|
if f:
|
|
suggested = {
|
|
"file_id": f["id"],
|
|
"filename": f["filename"],
|
|
"width": f["width"],
|
|
"height": f["height"],
|
|
"thumb_url": f"/api/thumb/{f['id']}",
|
|
}
|
|
groups.append({
|
|
"id": row["id"],
|
|
"method": row["method"],
|
|
"method_color": meta["color"],
|
|
"method_label": meta["label"],
|
|
"member_count": row["member_count"],
|
|
"reviewed": bool(row["reviewed"]),
|
|
"suggested_keeper": suggested,
|
|
})
|
|
|
|
con.close()
|
|
return {"total": total, "groups": groups}
|
|
|
|
|
|
@app.get("/api/groups/{group_id}")
|
|
def get_group(group_id: int):
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
|
|
cur.execute("SELECT * FROM duplicate_groups WHERE id=?", (group_id,))
|
|
grp = cur.fetchone()
|
|
if not grp:
|
|
raise HTTPException(404, "Group not found")
|
|
|
|
meta = METHOD_META.get(grp["method"], {"color": "#888", "label": grp["method"]})
|
|
|
|
cur.execute("""
|
|
SELECT f.id, f.filename, f.path, f.file_size, f.width, f.height,
|
|
f.mime_type, f.exif_datetime, f.exif_device,
|
|
f.is_takeout, f.is_edited,
|
|
dm.is_keeper, dm.suggested
|
|
FROM duplicate_members dm
|
|
JOIN files f ON f.id = dm.file_id
|
|
WHERE dm.group_id = ?
|
|
ORDER BY dm.suggested DESC, f.width * f.height DESC
|
|
""", (group_id,))
|
|
|
|
members = []
|
|
for r in cur.fetchall():
|
|
members.append({
|
|
"file_id": r["id"],
|
|
"filename": r["filename"],
|
|
"path": r["path"],
|
|
"file_size": r["file_size"],
|
|
"width": r["width"],
|
|
"height": r["height"],
|
|
"mime_type": r["mime_type"],
|
|
"exif_datetime": r["exif_datetime"],
|
|
"exif_device": r["exif_device"],
|
|
"is_takeout": bool(r["is_takeout"]),
|
|
"is_edited": bool(r["is_edited"]) if r["is_edited"] is not None else False,
|
|
"is_suggested": bool(r["suggested"]),
|
|
"is_keeper": bool(r["is_keeper"]),
|
|
"thumb_url": f"/api/thumb/{r['id']}",
|
|
})
|
|
|
|
con.close()
|
|
return {
|
|
"id": grp["id"],
|
|
"method": grp["method"],
|
|
"method_color": meta["color"],
|
|
"method_label": meta["label"],
|
|
"method_value": grp["method_value"],
|
|
"reviewed": bool(grp["reviewed"]),
|
|
"members": members,
|
|
}
|
|
|
|
|
|
# ── Decisions ─────────────────────────────────────────────────────────────────
|
|
|
|
class DecideBody(BaseModel):
    """Request body for POST /api/groups/{group_id}/decide."""
    # File id to keep (must be a member of the group); all other members
    # are marked redundant.
    keeper_file_id: int
|
|
|
|
|
|
@app.post("/api/groups/{group_id}/decide")
|
|
def decide(group_id: int, body: DecideBody):
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
|
|
cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
|
|
if not cur.fetchone():
|
|
raise HTTPException(404, "Group not found")
|
|
|
|
cur.execute("SELECT file_id FROM duplicate_members WHERE group_id=?", (group_id,))
|
|
all_members = [r["file_id"] for r in cur.fetchall()]
|
|
|
|
if body.keeper_file_id not in all_members:
|
|
raise HTTPException(400, "keeper_file_id is not a member of this group")
|
|
|
|
for fid in all_members:
|
|
is_k = 1 if fid == body.keeper_file_id else 0
|
|
cur.execute(
|
|
"UPDATE duplicate_members SET is_keeper=? WHERE group_id=? AND file_id=?",
|
|
(is_k, group_id, fid),
|
|
)
|
|
status = "keeper" if is_k else "redundant"
|
|
cur.execute("UPDATE files SET status=? WHERE id=?", (status, fid))
|
|
sc.log_decision(cur, fid, group_id, status, "manual")
|
|
|
|
cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
|
|
con.commit()
|
|
con.close()
|
|
return {"success": True}
|
|
|
|
|
|
@app.post("/api/groups/{group_id}/skip")
|
|
def skip_group(group_id: int):
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
|
|
if not cur.fetchone():
|
|
raise HTTPException(404, "Group not found")
|
|
cur.execute("SELECT file_id FROM duplicate_members WHERE group_id=?", (group_id,))
|
|
for r in cur.fetchall():
|
|
sc.log_decision(cur, r["file_id"], group_id, "skip", "manual")
|
|
cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
|
|
con.commit()
|
|
con.close()
|
|
return {"success": True}
|
|
|
|
|
|
@app.post("/api/groups/{group_id}/keep-all")
|
|
def keep_all(group_id: int):
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
|
|
if not cur.fetchone():
|
|
raise HTTPException(404, "Group not found")
|
|
cur.execute("SELECT file_id FROM duplicate_members WHERE group_id=?", (group_id,))
|
|
for r in cur.fetchall():
|
|
cur.execute(
|
|
"UPDATE duplicate_members SET is_keeper=1 WHERE group_id=? AND file_id=?",
|
|
(group_id, r["file_id"]),
|
|
)
|
|
cur.execute("UPDATE files SET status='keeper' WHERE id=?", (r["file_id"],))
|
|
sc.log_decision(cur, r["file_id"], group_id, "keeper", "keep-all")
|
|
cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (group_id,))
|
|
con.commit()
|
|
con.close()
|
|
return {"success": True}
|
|
|
|
|
|
@app.post("/api/groups/{group_id}/unreview")
|
|
def unreview_group(group_id: int):
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
cur.execute("SELECT id FROM duplicate_groups WHERE id=?", (group_id,))
|
|
if not cur.fetchone():
|
|
raise HTTPException(404, "Group not found")
|
|
cur.execute("SELECT file_id FROM duplicate_members WHERE group_id=?", (group_id,))
|
|
for r in cur.fetchall():
|
|
cur.execute(
|
|
"UPDATE duplicate_members SET is_keeper=0 WHERE group_id=? AND file_id=?",
|
|
(group_id, r["file_id"]),
|
|
)
|
|
cur.execute("UPDATE files SET status='pending' WHERE id=?", (r["file_id"],))
|
|
sc.log_decision(cur, r["file_id"], group_id, "unreview", "manual")
|
|
cur.execute("UPDATE duplicate_groups SET reviewed=0 WHERE id=?", (group_id,))
|
|
con.commit()
|
|
con.close()
|
|
return {"success": True}
|
|
|
|
|
|
@app.post("/api/groups/auto-resolve-exact")
|
|
def auto_resolve_exact():
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
cur.execute("""
|
|
SELECT id FROM duplicate_groups
|
|
WHERE method='sha256' AND reviewed=0
|
|
""")
|
|
groups = [r["id"] for r in cur.fetchall()]
|
|
resolved = 0
|
|
|
|
for gid in groups:
|
|
cur.execute("""
|
|
SELECT f.id, f.path, f.width, f.height, f.file_size,
|
|
f.exif_datetime, f.file_mtime
|
|
FROM duplicate_members dm
|
|
JOIN files f ON f.id = dm.file_id
|
|
WHERE dm.group_id = ?
|
|
""", (gid,))
|
|
members = [dict(r) for r in cur.fetchall()]
|
|
if not members:
|
|
continue
|
|
|
|
keeper_id = sc._suggested_keeper_by_resolution(members)
|
|
for m in members:
|
|
is_k = 1 if m["id"] == keeper_id else 0
|
|
cur.execute(
|
|
"UPDATE duplicate_members SET is_keeper=? WHERE group_id=? AND file_id=?",
|
|
(is_k, gid, m["id"]),
|
|
)
|
|
cur.execute(
|
|
"UPDATE files SET status=? WHERE id=?",
|
|
("keeper" if is_k else "redundant", m["id"]),
|
|
)
|
|
sc.log_decision(
|
|
cur, m["id"], gid,
|
|
"keeper" if is_k else "redundant",
|
|
"auto-resolve-exact",
|
|
)
|
|
cur.execute("UPDATE duplicate_groups SET reviewed=1 WHERE id=?", (gid,))
|
|
resolved += 1
|
|
|
|
con.commit()
|
|
con.close()
|
|
return {"resolved": resolved}
|
|
|
|
|
|
# ── Files + thumbnails ────────────────────────────────────────────────────────
|
|
|
|
# Inline SVG returned when a video frame cannot be extracted: a purple play
# triangle on a dark square.
VIDEO_PLACEHOLDER_SVG = """<svg xmlns="http://www.w3.org/2000/svg" width="200" height="200"
viewBox="0 0 200 200">
<rect width="200" height="200" fill="#1e1e2e"/>
<polygon points="75,55 75,145 145,100" fill="#9b7de8"/>
</svg>"""

# File extensions treated as video (thumbnail via ffmpeg first frame).
VIDEO_EXT = {".mp4", ".mov", ".avi", ".mkv", ".m4v", ".3gp", ".wmv", ".mts", ".m2ts"}
|
|
|
|
|
|
@app.get("/api/thumb/{file_id}")
|
|
def get_thumb(file_id: int):
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
cur.execute("SELECT path, mime_type, extension FROM files WHERE id=?", (file_id,))
|
|
row = cur.fetchone()
|
|
con.close()
|
|
|
|
if not row:
|
|
raise HTTPException(404, "File not found")
|
|
|
|
path = row["path"]
|
|
ext = (row["extension"] or "").lower()
|
|
|
|
if not os.path.isfile(path):
|
|
raise HTTPException(404, "File not on disk")
|
|
|
|
if ext in VIDEO_EXT:
|
|
# Try ffmpeg for first frame
|
|
try:
|
|
result = subprocess.run(
|
|
[
|
|
"ffmpeg", "-i", path,
|
|
"-vframes", "1", "-f", "image2", "-vcodec", "mjpeg",
|
|
"pipe:1",
|
|
],
|
|
capture_output=True, timeout=10,
|
|
)
|
|
if result.returncode == 0 and result.stdout:
|
|
return Response(content=result.stdout, media_type="image/jpeg")
|
|
except Exception:
|
|
pass
|
|
return Response(content=VIDEO_PLACEHOLDER_SVG, media_type="image/svg+xml")
|
|
|
|
# Serve photo directly
|
|
mime = row["mime_type"] or "application/octet-stream"
|
|
return FileResponse(path, media_type=mime)
|
|
|
|
|
|
@app.get("/api/files/{file_id}")
|
|
def get_file_meta(file_id: int):
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
cur.execute("SELECT * FROM files WHERE id=?", (file_id,))
|
|
row = cur.fetchone()
|
|
con.close()
|
|
if not row:
|
|
raise HTTPException(404, "File not found")
|
|
return dict(row)
|
|
|
|
|
|
# ── Stats ─────────────────────────────────────────────────────────────────────
|
|
|
|
@app.get("/api/browse")
|
|
def browse(path: str = Query("/")):
|
|
"""List subdirectories at the given path for the folder picker."""
|
|
try:
|
|
p = Path(path).resolve()
|
|
except Exception:
|
|
raise HTTPException(400, "Invalid path")
|
|
if not p.exists() or not p.is_dir():
|
|
raise HTTPException(404, "Path not found")
|
|
|
|
dirs = []
|
|
try:
|
|
for entry in sorted(p.iterdir()):
|
|
if entry.is_dir() and not entry.name.startswith("."):
|
|
dirs.append(entry.name)
|
|
except PermissionError:
|
|
pass
|
|
|
|
parent = str(p.parent) if p != p.parent else None
|
|
return {
|
|
"current": str(p),
|
|
"parent": parent,
|
|
"dirs": dirs,
|
|
}
|
|
|
|
|
|
@app.get("/api/stats")
|
|
def get_stats():
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
|
|
cur.execute("SELECT COUNT(*), SUM(file_size) FROM files WHERE status != 'error'")
|
|
r = cur.fetchone()
|
|
total_files = r[0] or 0
|
|
total_size = r[1] or 0
|
|
|
|
cur.execute("""
|
|
SELECT COUNT(*), SUM(f.file_size)
|
|
FROM files f
|
|
JOIN duplicate_members dm ON dm.file_id = f.id
|
|
WHERE dm.is_keeper = 0
|
|
""")
|
|
r = cur.fetchone()
|
|
dup_files = r[0] or 0
|
|
dup_size = r[1] or 0
|
|
|
|
by_method = {}
|
|
for method in ("sha256", "phash", "exif", "filesize"):
|
|
cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE method=?", (method,))
|
|
groups = cur.fetchone()[0]
|
|
cur.execute("""
|
|
SELECT COUNT(*) FROM duplicate_members dm
|
|
JOIN duplicate_groups dg ON dg.id = dm.group_id
|
|
WHERE dg.method = ?
|
|
""", (method,))
|
|
files = cur.fetchone()[0]
|
|
by_method[method] = {"groups": groups, "files": files}
|
|
|
|
cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=1")
|
|
reviewed = cur.fetchone()[0]
|
|
cur.execute("SELECT COUNT(*) FROM duplicate_groups WHERE reviewed=0")
|
|
pending = cur.fetchone()[0]
|
|
|
|
cur.execute("SELECT COUNT(*) FROM files WHERE is_takeout=1")
|
|
takeout_files = cur.fetchone()[0]
|
|
|
|
con.close()
|
|
return {
|
|
"total_files": total_files,
|
|
"total_size_bytes": total_size,
|
|
"duplicate_files": dup_files,
|
|
"duplicate_size_bytes": dup_size,
|
|
"groups_by_method": by_method,
|
|
"reviewed": reviewed,
|
|
"pending": pending,
|
|
"takeout_files": takeout_files,
|
|
}
|
|
|
|
|
|
# ── Export ────────────────────────────────────────────────────────────────────
|
|
|
|
@app.get("/api/export/csv")
|
|
def export_csv():
|
|
con = get_db()
|
|
cur = con.cursor()
|
|
cur.execute("""
|
|
SELECT dg.id as group_id, dg.method, f.id as file_id,
|
|
f.path, f.filename, f.file_size,
|
|
f.width, f.height, f.exif_datetime, f.exif_device,
|
|
dm.is_keeper,
|
|
CASE WHEN dm.is_keeper=0 AND dg.reviewed=1 THEN 1 ELSE 0 END as is_redundant,
|
|
dg.reviewed
|
|
FROM duplicate_groups dg
|
|
JOIN duplicate_members dm ON dm.group_id = dg.id
|
|
JOIN files f ON f.id = dm.file_id
|
|
ORDER BY dg.id, dm.is_keeper DESC
|
|
""")
|
|
rows = cur.fetchall()
|
|
con.close()
|
|
|
|
output = io.StringIO()
|
|
# QUOTE_ALL + explicit lineterminator handles paths/filenames containing
|
|
# embedded \r, \n, quotes, or NULs — which the default dialect refuses.
|
|
writer = csv.writer(output, quoting=csv.QUOTE_ALL, lineterminator="\n")
|
|
writer.writerow([
|
|
"group_id", "method", "file_id", "path", "filename",
|
|
"size", "width", "height", "exif_date", "device",
|
|
"is_keeper", "is_redundant", "reviewed",
|
|
])
|
|
|
|
def _clean(v):
|
|
# Strip NULs (csv writer rejects them) and normalise embedded line breaks
|
|
if isinstance(v, str):
|
|
return v.replace("\x00", "").replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
|
|
return v
|
|
|
|
for r in rows:
|
|
writer.writerow([
|
|
r["group_id"], r["method"], r["file_id"],
|
|
_clean(r["path"]), _clean(r["filename"]), r["file_size"],
|
|
r["width"], r["height"], _clean(r["exif_datetime"]),
|
|
_clean(r["exif_device"]),
|
|
r["is_keeper"], r["is_redundant"], r["reviewed"],
|
|
])
|
|
|
|
output.seek(0)
|
|
return StreamingResponse(
|
|
iter([output.getvalue()]),
|
|
media_type="text/csv",
|
|
headers={"Content-Disposition": "attachment; filename=dup-finder-export.csv"},
|
|
)
|