GPU-accelerated phash + fix discovery/takeout hang

GPU:
- Switch Dockerfile base to pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime
- Add gpu_hasher.py: batched 2D DCT on GPU via PyTorch matrix multiply,
  256 images/batch, produces imagehash-compatible 64-bit hex hashes,
  auto-falls back to CPU when CUDA is unavailable (see the sketch after
  this list)
- Replace per-image phash loop in scanner.py with phasher.hash_files()
- docker-compose.yml: add nvidia GPU device reservation
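
The core trick in gpu_hasher.py is that a 2D DCT-II factors into two
matrix multiplies (D @ X @ D.T), so a whole batch of 32x32 grayscale
images can be transformed with one batched matmul per axis instead of
one scipy call per image. A minimal sketch of that idea follows; the
shipped gpu_hasher.py is not reproduced here, so dct_matrix, phash_batch,
and the bit-packing details are illustrative:

import torch

N, LOW = 32, 8  # imagehash defaults: 32x32 input, top-left 8x8 of the DCT

def dct_matrix(n: int, device) -> torch.Tensor:
    # Non-normalized DCT-II with scipy.fftpack.dct's default scaling
    # (the variant imagehash uses), so bit patterns stay comparable.
    k = torch.arange(n, device=device, dtype=torch.float32).unsqueeze(1)
    x = torch.arange(n, device=device, dtype=torch.float32).unsqueeze(0)
    return 2.0 * torch.cos(torch.pi * k * (2 * x + 1) / (2 * n))

def phash_batch(gray: torch.Tensor) -> list[str]:
    # gray: (B, 32, 32) float tensor of grayscale pixels, already on device
    D = dct_matrix(N, gray.device)
    dct2 = D @ gray @ D.T                         # batched 2D DCT, (B, 32, 32)
    low = dct2[:, :LOW, :LOW].reshape(-1, LOW * LOW)
    med = low.median(dim=1, keepdim=True).values  # caveat: torch.median takes
    bits = (low > med).cpu()                      # the lower middle value,
    out = []                                      # numpy.median averages two
    for row in bits:
        v = 0
        for b in row.tolist():       # pack 64 bits MSB-first, row-major,
            v = (v << 1) | int(b)    # matching imagehash's hex layout
        out.append(f"{v:016x}")
    return out

device = "cuda" if torch.cuda.is_available() else "cpu"  # CPU fallback path

Batching is what pays off here: one matmul over 256 images keeps the GPU
busy, where tens of thousands of individual scipy DCT calls serialize on
the CPU.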

Hang fix:
- takeout.is_takeout_folder() now caps at 50 directories (was walking the
  entire tree — blocked for minutes on 65k+ file libraries); see the
  sketch below
- Add a "Not a Takeout folder" status message so the takeout phase is
  never silent

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
tocmo committed 2026-04-05 01:37:28 -04:00
parent 1d46b9945d
commit c110a8e4f9
6 changed files with 222 additions and 20 deletions

scanner.py

@@ -20,6 +20,7 @@ except ImportError:
     pass
 from takeout import is_takeout_folder, process_takeout
+from gpu_hasher import get_phasher
 PHOTO_EXT = {
@@ -516,10 +517,14 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
     con.commit()

     # ── Phase: takeout pre-processing ─────────────────────────────────
-    scan_state.update(phase="takeout", message="Checking for Google Takeout structure...")
+    # Detection samples ≤50 dirs so it never blocks on large libraries
+    scan_state.update(phase="takeout",
+                      message="Checking for Google Takeout structure (sampling)...")
     if is_takeout_folder(folder_path):
         scan_state["message"] = "Processing Google Takeout sidecars..."
         process_takeout(folder_path, DB_PATH)
+    else:
+        scan_state["message"] = "Not a Takeout folder — skipping"

     if scan_state["cancel_requested"]:
         _mark_scan(cur, scan_id, "cancelled")
@@ -607,8 +612,10 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
     con.commit()

     # ── Phase: phash ──────────────────────────────────────────────────
+    phasher = get_phasher()
+    hw_label = "GPU" if phasher.using_gpu else "CPU"
     scan_state.update(phase="phash", progress=0,
-                      message="Computing perceptual hashes...")
+                      message=f"Computing perceptual hashes ({hw_label})...")
     cur.execute("""
         SELECT id, path FROM files
@@ -621,19 +628,35 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
     photo_rows = cur.fetchall()
     scan_state["total"] = len(photo_rows)
-    for i, row in enumerate(photo_rows):
-        if scan_state["cancel_requested"]:
-            _mark_scan(cur, scan_id, "cancelled")
-            con.commit()
-            scan_state["status"] = "cancelled"
-            return
-        scan_state["progress"] = i + 1
-        scan_state["message"] = f"Phash: {Path(row['path']).name}"
-        ph = _phash(row["path"])
-        if ph:
-            cur.execute("UPDATE files SET phash=? WHERE id=?", (ph, row["id"]))
-        if (i + 1) % 200 == 0:
-            con.commit()
+    if photo_rows:
+        # Build id lookup so we can write results back efficiently
+        path_to_id = {row["path"]: row["id"] for row in photo_rows}
+        all_paths = list(path_to_id.keys())
+
+        def _phash_progress(n_done: int):
+            if scan_state["cancel_requested"]:
+                return
+            scan_state["progress"] = n_done
+            scan_state["message"] = (
+                f"Phash ({hw_label}): {n_done:,} / {len(all_paths):,}"
+            )
+
+        results = phasher.hash_files(all_paths, progress_cb=_phash_progress)
+
+        # Bulk write to DB in chunks of 500
+        items = list(results.items())
+        for chunk_start in range(0, len(items), 500):
+            if scan_state["cancel_requested"]:
+                _mark_scan(cur, scan_id, "cancelled")
+                con.commit()
+                scan_state["status"] = "cancelled"
+                return
+            for path, ph in items[chunk_start : chunk_start + 500]:
+                fid = path_to_id.get(path)
+                if fid and ph:
+                    cur.execute(
+                        "UPDATE files SET phash=? WHERE id=?", (ph, fid)
+                    )
+            con.commit()
     con.commit()
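
For reference, the contract scanner.py assumes from gpu_hasher, inferred
only from the calls in the hunks above (get_phasher, .using_gpu,
.hash_files); the class name and docstrings here are illustrative:

from typing import Callable, Optional

class Phasher:
    using_gpu: bool

    def hash_files(self, paths: list[str],
                   progress_cb: Optional[Callable[[int], None]] = None
                   ) -> dict[str, Optional[str]]:
        """Map each path to its 64-bit hex phash (None if decode fails),
        calling progress_cb with the running count of finished images."""
        ...

def get_phasher() -> Phasher:
    """Return a GPU-backed hasher when CUDA is available, else CPU."""
    ...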