GPU-accelerated phash + fix discovery/takeout hang
GPU:
- Switch Dockerfile base to pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime
- Add gpu_hasher.py: batched 2D DCT on GPU via PyTorch matrix multiply, 256 images/batch, produces imagehash-compatible 64-bit hex hashes, auto-falls back to CPU when CUDA unavailable
- Replace per-image phash loop in scanner.py with phasher.hash_files()
- docker-compose.yml: add nvidia GPU device reservation

Hang fix:
- takeout.is_takeout_folder() now caps at 50 directories (was walking the entire tree, which blocked for minutes on 65k+ file libraries)
- Add "Not a Takeout folder" status message so the takeout phase is never silent

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
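The new gpu_hasher.py module itself is not included in the diff below (only the scanner.py changes are shown), so here is a minimal sketch of the batched-DCT idea the message describes, assuming imagehash-style preprocessing. The names phash_batch, HASH_SIZE, and IMG_SIZE are illustrative, not the module's actual API; the real module additionally processes images in groups of 256 and exposes the get_phasher()/hash_files() wrapper used in the diff.

import math

import numpy as np
import torch
from PIL import Image

HASH_SIZE = 8    # 8x8 low-frequency block -> 64-bit hash
IMG_SIZE = 32    # imagehash default: hash_size * highfreq_factor (8 * 4)

def _dct_matrix(n: int, device: torch.device) -> torch.Tensor:
    # Unnormalized DCT-II basis (same family scipy.fftpack.dct uses);
    # the uniform scale factor cancels out in the "coefficient > median" test.
    k = torch.arange(n, device=device, dtype=torch.float32).unsqueeze(1)
    i = torch.arange(n, device=device, dtype=torch.float32).unsqueeze(0)
    return 2.0 * torch.cos(math.pi * k * (2 * i + 1) / (2 * n))

def phash_batch(paths: list[str]) -> dict[str, str]:
    # Falls back to CPU automatically when CUDA is unavailable.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    D = _dct_matrix(IMG_SIZE, device)

    # Decode and resize on CPU, mirroring imagehash.phash preprocessing.
    imgs = []
    for p in paths:
        with Image.open(p) as im:
            im = im.convert("L").resize((IMG_SIZE, IMG_SIZE), Image.LANCZOS)
            imgs.append(np.asarray(im, dtype=np.float32))

    batch = torch.from_numpy(np.stack(imgs)).to(device)   # (B, 32, 32)
    dct = D @ batch @ D.T                                  # batched 2D DCT via matmul
    low = dct[:, :HASH_SIZE, :HASH_SIZE].cpu().numpy()     # (B, 8, 8) low frequencies
    flat = low.reshape(len(paths), -1)                     # (B, 64)
    med = np.median(flat, axis=1, keepdims=True)           # per-image median
    bits = flat > med                                      # (B, 64) booleans

    out = {}
    for p, row in zip(paths, bits):
        # Pack 64 booleans MSB-first into a 16-char hex string, the same
        # layout str(imagehash.ImageHash(...)) produces.
        val = 0
        for b in row:
            val = (val << 1) | int(b)
        out[p] = f"{val:016x}"
    return out

Expressing the 2D DCT as D @ X @ D.T lets a single matmul cover a whole batch of images, which is what makes the GPU path worthwhile; thresholding each image against its own low-frequency median keeps the resulting hex strings comparable with hashes imagehash computes on the CPU.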
--- a/scanner.py
+++ b/scanner.py
@@ -20,6 +20,7 @@ except ImportError:
     pass

 from takeout import is_takeout_folder, process_takeout
+from gpu_hasher import get_phasher


 PHOTO_EXT = {
@@ -516,10 +517,14 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
     con.commit()

     # ── Phase: takeout pre-processing ─────────────────────────────────
-    scan_state.update(phase="takeout", message="Checking for Google Takeout structure...")
+    # Detection samples ≤50 dirs so it never blocks on large libraries
+    scan_state.update(phase="takeout",
+                      message="Checking for Google Takeout structure (sampling)...")
     if is_takeout_folder(folder_path):
         scan_state["message"] = "Processing Google Takeout sidecars..."
         process_takeout(folder_path, DB_PATH)
+    else:
+        scan_state["message"] = "Not a Takeout folder — skipping"

     if scan_state["cancel_requested"]:
         _mark_scan(cur, scan_id, "cancelled")
@@ -607,8 +612,10 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
     con.commit()

     # ── Phase: phash ──────────────────────────────────────────────────
+    phasher = get_phasher()
+    hw_label = "GPU" if phasher.using_gpu else "CPU"
     scan_state.update(phase="phash", progress=0,
-                      message="Computing perceptual hashes...")
+                      message=f"Computing perceptual hashes ({hw_label})...")

     cur.execute("""
         SELECT id, path FROM files
@@ -621,19 +628,35 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"):
     photo_rows = cur.fetchall()
     scan_state["total"] = len(photo_rows)

-    for i, row in enumerate(photo_rows):
-        if scan_state["cancel_requested"]:
-            _mark_scan(cur, scan_id, "cancelled")
-            con.commit()
-            scan_state["status"] = "cancelled"
-            return
+    if photo_rows:
+        # Build id lookup so we can write results back efficiently
+        path_to_id = {row["path"]: row["id"] for row in photo_rows}
+        all_paths = list(path_to_id.keys())

-        scan_state["progress"] = i + 1
-        scan_state["message"] = f"Phash: {Path(row['path']).name}"
-        ph = _phash(row["path"])
-        if ph:
-            cur.execute("UPDATE files SET phash=? WHERE id=?", (ph, row["id"]))
-        if (i + 1) % 200 == 0:
-            con.commit()
+        def _phash_progress(n_done: int):
+            if scan_state["cancel_requested"]:
+                return
+            scan_state["progress"] = n_done
+            scan_state["message"] = (
+                f"Phash ({hw_label}): {n_done:,} / {len(all_paths):,}"
+            )
+
+        results = phasher.hash_files(all_paths, progress_cb=_phash_progress)
+
+        # Bulk write to DB in chunks of 500
+        items = list(results.items())
+        for chunk_start in range(0, len(items), 500):
+            if scan_state["cancel_requested"]:
+                _mark_scan(cur, scan_id, "cancelled")
+                con.commit()
+                scan_state["status"] = "cancelled"
+                return
+            for path, ph in items[chunk_start : chunk_start + 500]:
+                fid = path_to_id.get(path)
+                if fid and ph:
+                    cur.execute(
+                        "UPDATE files SET phash=? WHERE id=?", (ph, fid)
+                    )
+            con.commit()

     con.commit()
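The takeout.py side of the hang fix is likewise not part of this diff. A minimal sketch of a detection check capped at 50 directories, with an assumed sidecar heuristic (the project's actual signals and function body may differ), could look like:

import os

def is_takeout_folder(root: str, max_dirs: int = 50) -> bool:
    """Heuristic check for a Google Takeout export.

    Visits at most `max_dirs` directories so detection stays fast even on
    libraries with tens of thousands of files.
    """
    seen = 0
    for dirpath, dirnames, filenames in os.walk(root):
        seen += 1
        if seen > max_dirs:
            break
        # Takeout exports pair each photo with a JSON metadata sidecar
        # under a "Takeout" / "Google Photos" folder (assumed heuristic).
        if any(name.endswith(".json") for name in filenames) and (
            "Takeout" in dirpath or "Google Photos" in dirpath
        ):
            return True
    return False

Capping the walk bounds detection at roughly 50 directory listings regardless of library size, which is what removes the multi-minute stall on 65k+ file trees.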