GPU-accelerated phash + fix discovery/takeout hang

GPU:
- Switch Dockerfile base to pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime
- Add gpu_hasher.py: batched 2D DCT on GPU via PyTorch matrix multiply,
  256 images/batch, produces imagehash-compatible 64-bit hex hashes,
  auto-falls back to CPU when CUDA is unavailable (see the sketch after
  this list)
- Replace per-image phash loop in scanner.py with phasher.hash_files()
- docker-compose.yml: add nvidia GPU device reservation
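
The core trick in gpu_hasher.py is that a 2D DCT-II factors into two
matrix multiplies (D @ X @ D.T), so a whole batch of 32x32 grayscale
images can be transformed with one batched matmul per axis instead of
one scipy call per image. A minimal sketch of that idea follows; the
shipped gpu_hasher.py is not reproduced here, so dct_matrix, phash_batch,
and the bit-packing details are illustrative:

import torch

N, LOW = 32, 8  # imagehash defaults: 32x32 input, top-left 8x8 of the DCT

def dct_matrix(n: int, device) -> torch.Tensor:
    # Non-normalized DCT-II with scipy.fftpack.dct's default scaling
    # (the variant imagehash uses), so bit patterns stay comparable.
    k = torch.arange(n, device=device, dtype=torch.float32).unsqueeze(1)
    x = torch.arange(n, device=device, dtype=torch.float32).unsqueeze(0)
    return 2.0 * torch.cos(torch.pi * k * (2 * x + 1) / (2 * n))

def phash_batch(gray: torch.Tensor) -> list[str]:
    # gray: (B, 32, 32) float tensor of grayscale pixels, already on device
    D = dct_matrix(N, gray.device)
    dct2 = D @ gray @ D.T                         # batched 2D DCT, (B, 32, 32)
    low = dct2[:, :LOW, :LOW].reshape(-1, LOW * LOW)
    med = low.median(dim=1, keepdim=True).values  # caveat: torch.median takes
    bits = (low > med).cpu()                      # the lower middle value,
    out = []                                      # numpy.median averages two
    for row in bits:
        v = 0
        for b in row.tolist():       # pack 64 bits MSB-first, row-major,
            v = (v << 1) | int(b)    # matching imagehash's hex layout
        out.append(f"{v:016x}")
    return out

device = "cuda" if torch.cuda.is_available() else "cpu"  # CPU fallback path

Batching is what pays off here: one matmul over 256 images keeps the GPU
busy, where tens of thousands of individual scipy DCT calls serialize on
the CPU.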

Hang fix:
- takeout.is_takeout_folder() now caps at 50 directories (was walking the
  entire tree — blocked for minutes on 65k+ file libraries); see the
  sketch below
- Add a "Not a Takeout folder" status message so the takeout phase is
  never silent

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
tocmo committed 2026-04-05 01:37:28 -04:00
parent 1d46b9945d
commit c110a8e4f9
6 changed files with 222 additions and 20 deletions

scanner.py

@@ -20,6 +20,7 @@ except ImportError:
     pass
 from takeout import is_takeout_folder, process_takeout
+from gpu_hasher import get_phasher
 PHOTO_EXT = {
@@ -516,10 +517,14 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
     con.commit()

     # ── Phase: takeout pre-processing ─────────────────────────────────
-    scan_state.update(phase="takeout", message="Checking for Google Takeout structure...")
+    # Detection samples ≤50 dirs so it never blocks on large libraries
+    scan_state.update(phase="takeout",
+                      message="Checking for Google Takeout structure (sampling)...")
     if is_takeout_folder(folder_path):
         scan_state["message"] = "Processing Google Takeout sidecars..."
         process_takeout(folder_path, DB_PATH)
+    else:
+        scan_state["message"] = "Not a Takeout folder — skipping"

     if scan_state["cancel_requested"]:
         _mark_scan(cur, scan_id, "cancelled")
@@ -607,8 +612,10 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
     con.commit()

     # ── Phase: phash ──────────────────────────────────────────────────
+    phasher = get_phasher()
+    hw_label = "GPU" if phasher.using_gpu else "CPU"
     scan_state.update(phase="phash", progress=0,
-                      message="Computing perceptual hashes...")
+                      message=f"Computing perceptual hashes ({hw_label})...")
     cur.execute("""
         SELECT id, path FROM files
@@ -621,19 +628,35 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
     photo_rows = cur.fetchall()
     scan_state["total"] = len(photo_rows)
-    for i, row in enumerate(photo_rows):
-        if scan_state["cancel_requested"]:
-            _mark_scan(cur, scan_id, "cancelled")
-            con.commit()
-            scan_state["status"] = "cancelled"
-            return
-        scan_state["progress"] = i + 1
-        scan_state["message"] = f"Phash: {Path(row['path']).name}"
-        ph = _phash(row["path"])
-        if ph:
-            cur.execute("UPDATE files SET phash=? WHERE id=?", (ph, row["id"]))
-        if (i + 1) % 200 == 0:
-            con.commit()
+    if photo_rows:
+        # Build id lookup so we can write results back efficiently
+        path_to_id = {row["path"]: row["id"] for row in photo_rows}
+        all_paths = list(path_to_id.keys())
+
+        def _phash_progress(n_done: int):
+            if scan_state["cancel_requested"]:
+                return
+            scan_state["progress"] = n_done
+            scan_state["message"] = (
+                f"Phash ({hw_label}): {n_done:,} / {len(all_paths):,}"
+            )
+
+        results = phasher.hash_files(all_paths, progress_cb=_phash_progress)
+
+        # Bulk write to DB in chunks of 500
+        items = list(results.items())
+        for chunk_start in range(0, len(items), 500):
+            if scan_state["cancel_requested"]:
+                _mark_scan(cur, scan_id, "cancelled")
+                con.commit()
+                scan_state["status"] = "cancelled"
+                return
+            for path, ph in items[chunk_start : chunk_start + 500]:
+                fid = path_to_id.get(path)
+                if fid and ph:
+                    cur.execute(
+                        "UPDATE files SET phash=? WHERE id=?", (ph, fid)
+                    )
+            con.commit()
     con.commit()
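
For reference, the contract scanner.py assumes from gpu_hasher, inferred
only from the calls in the hunks above (get_phasher, .using_gpu,
.hash_files); the class name and docstrings here are illustrative:

from typing import Callable, Optional

class Phasher:
    using_gpu: bool

    def hash_files(self, paths: list[str],
                   progress_cb: Optional[Callable[[int], None]] = None
                   ) -> dict[str, Optional[str]]:
        """Map each path to its 64-bit hex phash (None if decode fails),
        calling progress_cb with the running count of finished images."""
        ...

def get_phasher() -> Phasher:
    """Return a GPU-backed hasher when CUDA is available, else CPU."""
    ...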