diff --git a/app/main.py b/app/main.py index c3d577d..370be2d 100644 --- a/app/main.py +++ b/app/main.py @@ -105,11 +105,14 @@ def scan_start(body: ScanStartBody): sc.scan_state.update( scan_id=scan_id, status="running", - phase="discovery", + phase="takeout", progress=0, total=0, message="Starting...", - cancel_requested=False, + pause_requested=False, + files_indexed=0, + phashes_done=0, + folder_path=body.folder_path, stats={}, ) @@ -146,24 +149,76 @@ def scan_status(): con.close() return { - "scan_id": state["scan_id"], - "status": state["status"], - "phase": state["phase"], - "progress": state["progress"], - "total": state["total"], - "message": state["message"], - "stats": stats, + "scan_id": state["scan_id"], + "status": state["status"], + "phase": state["phase"], + "progress": state["progress"], + "total": state["total"], + "message": state["message"], + "folder_path": state.get("folder_path"), + "files_indexed": state.get("files_indexed", 0), + "phashes_done": state.get("phashes_done", 0), + "stats": stats, } -@app.post("/api/scan/cancel") -def scan_cancel(): +@app.post("/api/scan/pause") +def scan_pause(): if sc.scan_state["status"] != "running": raise HTTPException(400, "No scan is currently running") - sc.scan_state["cancel_requested"] = True + sc.scan_state["pause_requested"] = True return {"success": True} +# Keep /cancel as an alias so any lingering clients still work +@app.post("/api/scan/cancel") +def scan_cancel(): + return scan_pause() + + +@app.post("/api/scan/resume") +def scan_resume(): + if sc.scan_state["status"] != "paused": + raise HTTPException(400, "No paused scan to resume") + + folder_path = sc.scan_state.get("folder_path") + if not folder_path: + raise HTTPException(400, "No folder path saved — please start a new scan") + + con = get_db() + cur = con.cursor() + cur.execute( + "INSERT INTO scans (folder_path, status) VALUES (?, 'running')", + (folder_path,), + ) + scan_id = cur.lastrowid + con.commit() + con.close() + + sc.scan_state.update( + scan_id=scan_id, + status="running", + phase="takeout", + progress=0, + total=0, + message="Resuming scan...", + pause_requested=False, + files_indexed=0, + phashes_done=0, + folder_path=folder_path, + stats={}, + ) + + thread = threading.Thread( + target=sc.run_scan, + args=(folder_path, scan_id, "incremental"), + daemon=True, + ) + thread.start() + + return {"scan_id": scan_id} + + @app.delete("/api/scan/reset") def scan_reset(confirm: str = Query("")): if confirm != "RESET": @@ -178,7 +233,9 @@ def scan_reset(confirm: str = Query("")): con.close() sc.scan_state.update( scan_id=None, status="idle", phase="idle", - progress=0, total=0, message="", stats={}, + progress=0, total=0, message="", + pause_requested=False, files_indexed=0, + phashes_done=0, folder_path=None, stats={}, ) return {"success": True} diff --git a/app/scanner.py b/app/scanner.py index 34b487b..d2d5093 100644 --- a/app/scanner.py +++ b/app/scanner.py @@ -44,14 +44,17 @@ DB_PATH = str(_DATA_DIR / "dupfinder.db") # Shared scan state (updated by background thread, read by status endpoint) scan_state = { - "scan_id": None, - "status": "idle", # idle | running | complete | error | cancelled - "phase": "idle", # discovery | takeout | indexing | phash | grouping | done - "progress": 0, - "total": 0, - "message": "", - "cancel_requested": False, - "stats": {}, + "scan_id": None, + "status": "idle", # idle|running|paused|complete|error + "phase": "idle", # takeout|indexing|phash|grouping|done + "progress": 0, + "total": 0, + "message": "", + "folder_path": None, # persists so resume knows where to continue + "pause_requested": False, + "files_indexed": 0, # cumulative across phases + "phashes_done": 0, + "stats": {}, } @@ -92,12 +95,15 @@ def init_db(): ); CREATE TABLE IF NOT EXISTS scans ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - folder_path TEXT NOT NULL, - started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - completed_at TIMESTAMP, - total_files INTEGER DEFAULT 0, - status TEXT DEFAULT 'running' + id INTEGER PRIMARY KEY AUTOINCREMENT, + folder_path TEXT NOT NULL, + started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + completed_at TIMESTAMP, + total_files INTEGER DEFAULT 0, + files_indexed INTEGER DEFAULT 0, + phashes_done INTEGER DEFAULT 0, + last_phase TEXT DEFAULT 'indexing', + status TEXT DEFAULT 'running' ); CREATE TABLE IF NOT EXISTS duplicate_groups ( @@ -122,7 +128,48 @@ def init_db(): CREATE INDEX IF NOT EXISTS idx_size_dim ON files(file_size, width, height); CREATE INDEX IF NOT EXISTS idx_status ON files(status); """) + # Migration: add new columns to scans if upgrading from older schema + for col, defn in [ + ("files_indexed", "INTEGER DEFAULT 0"), + ("phashes_done", "INTEGER DEFAULT 0"), + ("last_phase", "TEXT DEFAULT 'indexing'"), + ]: + try: + cur.execute(f"ALTER TABLE scans ADD COLUMN {col} {defn}") + except Exception: + pass # column already exists con.commit() + + # ── Detect interrupted scans from previous run ──────────────────────────── + # Any scan left as 'running' means the server was killed mid-scan. + # Mark them 'paused' so the UI offers a resume button. + cur.execute(""" + UPDATE scans SET status = 'paused' + WHERE status = 'running' + """) + con.commit() + + # Restore scan_state if there's a paused scan + cur.execute(""" + SELECT id, folder_path, files_indexed, phashes_done, last_phase + FROM scans WHERE status = 'paused' + ORDER BY started_at DESC LIMIT 1 + """) + row = cur.fetchone() + if row: + scan_state.update( + scan_id=row["id"], + status="paused", + phase=row["last_phase"] or "indexing", + folder_path=row["folder_path"], + files_indexed=row["files_indexed"] or 0, + phashes_done=row["phashes_done"] or 0, + message=( + f"Paused — {row['files_indexed']:,} files indexed, " + f"{row['phashes_done']:,} phashes done" + ), + ) + con.close() @@ -473,11 +520,27 @@ def _run_filesize_pass(con: sqlite3.Connection, scan_id: int): ) +# ── Pause helpers ──────────────────────────────────────────────────────────── + +def _save_pause_state(cur, scan_id: int, phase: str, + files_indexed: int, phashes_done: int): + """Persist pause progress so the scan survives a server restart.""" + cur.execute(""" + UPDATE scans SET + status = 'paused', + last_phase = ?, + files_indexed = ?, + phashes_done = ? + WHERE id = ? + """, (phase, files_indexed, phashes_done, scan_id)) + + # ── Main scan entry point ───────────────────────────────────────────────────── def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"): """Main scan function — runs in background thread.""" global scan_state + scan_state["folder_path"] = folder_path # persist so resume knows where to continue con = get_db() cur = con.cursor() @@ -498,10 +561,13 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"): else: scan_state["message"] = "Not a Takeout folder — skipping" - if scan_state["cancel_requested"]: - _mark_scan(cur, scan_id, "cancelled") + if scan_state["pause_requested"]: + _save_pause_state(cur, scan_id, "takeout", 0, 0) con.commit() - scan_state["status"] = "cancelled" + scan_state.update( + status="paused", pause_requested=False, + message="Paused during Takeout check", + ) return # ── Phases: discovery + indexing (pipelined) ────────────────────── @@ -530,6 +596,7 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"): _discovered = [0] # total files found by walker so far _done = [0] # files fully indexed (skipped + processed) _walk_done = [False] + _pause_at_end = False # set True when pause requested mid-walk all_files: list[str] = [] to_skip: list[str] = [] changed_ids: list[int] = [] @@ -608,12 +675,9 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"): for root, dirs, files in os.walk(folder_path): dirs[:] = [d for d in dirs if not d.startswith(".")] - if scan_state["cancel_requested"]: - pool.shutdown(wait=False, cancel_futures=True) - _mark_scan(cur, scan_id, "cancelled") - con.commit() - scan_state["status"] = "cancelled" - return + if scan_state["pause_requested"]: + _pause_at_end = True + break # stop walking; in-flight futures drain normally for fname in files: if fname.endswith(".json"): @@ -671,12 +735,6 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"): # ── Wait for remaining futures ──────────────────────────────── scan_state["total"] = len(all_files) for future in as_completed(pending): - if scan_state["cancel_requested"]: - pool.shutdown(wait=False, cancel_futures=True) - _mark_scan(cur, scan_id, "cancelled") - con.commit() - scan_state["status"] = "cancelled" - return path, existing = pending[future] _write_result(path, future.result(), existing) with _lock: @@ -691,6 +749,17 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"): con.commit() + # ── Pause checkpoint: after indexing ────────────────────────────── + scan_state["files_indexed"] = _done[0] + if _pause_at_end: + _save_pause_state(cur, scan_id, "indexing", _done[0], 0) + con.commit() + scan_state.update( + status="paused", pause_requested=False, + message=f"Paused — {_done[0]:,} files indexed", + ) + return + # ── Phase: phash ────────────────────────────────────────────────── phasher = get_phasher() hw_label = "GPU" if phasher.using_gpu else "CPU" @@ -709,29 +778,34 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"): scan_state["total"] = len(photo_rows) if photo_rows: - # Build id lookup so we can write results back efficiently path_to_id = {row["path"]: row["id"] for row in photo_rows} all_paths = list(path_to_id.keys()) - def _phash_progress(n_done: int): - if scan_state["cancel_requested"]: - return - scan_state["progress"] = n_done - scan_state["message"] = ( - f"Phash ({hw_label}): {n_done:,} / {len(all_paths):,}" - ) + # Process in chunks so pause requests are honoured between batches + PHASH_CHUNK = 500 + phashes_written = 0 - results = phasher.hash_files(all_paths, progress_cb=_phash_progress) - - # Bulk write to DB in chunks of 500 - items = list(results.items()) - for chunk_start in range(0, len(items), 500): - if scan_state["cancel_requested"]: - _mark_scan(cur, scan_id, "cancelled") + for chunk_start in range(0, len(all_paths), PHASH_CHUNK): + if scan_state["pause_requested"]: + _save_pause_state( + cur, scan_id, "phash", + scan_state["files_indexed"], phashes_written, + ) con.commit() - scan_state["status"] = "cancelled" + scan_state.update( + status="paused", pause_requested=False, + phashes_done=phashes_written, + message=( + f"Paused — {phashes_written:,} / {len(all_paths):,} " + "perceptual hashes computed" + ), + ) return - for path, ph in items[chunk_start : chunk_start + 500]: + + chunk = all_paths[chunk_start : chunk_start + PHASH_CHUNK] + chunk_results = phasher.hash_files(chunk, progress_cb=None) + + for path, ph in chunk_results.items(): fid = path_to_id.get(path) if fid and ph: cur.execute( @@ -739,6 +813,13 @@ def run_scan(folder_path: str, scan_id: int, mode: str = "incremental"): ) con.commit() + phashes_written += len(chunk) + scan_state["phashes_done"] = phashes_written + scan_state["progress"] = phashes_written + scan_state["message"] = ( + f"Phash ({hw_label}): {phashes_written:,} / {len(all_paths):,}" + ) + con.commit() # ── Phase: grouping ─────────────────────────────────────────────── diff --git a/templates/index.html b/templates/index.html index 894eb99..55aedb8 100644 --- a/templates/index.html +++ b/templates/index.html @@ -61,6 +61,7 @@ #scan-chip.complete { border-color: var(--success); color: var(--success); } #scan-chip.error { border-color: var(--danger); color: var(--danger); } #scan-chip.cancelled { border-color: var(--warning); color: var(--warning); } + #scan-chip.paused { border-color: var(--warning); color: var(--warning); } #topbar-stats { margin-left: auto; display: flex; gap: 20px; font-size: 12px; color: var(--text-dim); } #topbar-stats span b { color: var(--text); } @@ -242,6 +243,20 @@ /* ── Rescan buttons ── */ #rescan-area { display: none; margin-top: 16px; } #rescan-area.show { display: block; } + + #paused-area { display: none; margin-top: 16px; } + #paused-area.show { display: block; } + .pause-banner { + display: flex; align-items: flex-start; gap: 12px; + background: rgba(226,164,58,.1); + border: 1px solid rgba(226,164,58,.35); + border-radius: var(--radius); + padding: 12px 14px; + margin-bottom: 10px; + } + .pause-icon { font-size: 22px; line-height: 1; } + .pause-title { font-weight: 600; color: var(--warning); margin-bottom: 4px; } + .pause-details { font-size: 12px; color: var(--text-dim); line-height: 1.6; } .rescan-info { font-size: 12px; color: var(--text-dim); margin-bottom: 10px; } .rescan-buttons { display: flex; @@ -765,7 +780,21 @@ Grouping
- + +
+ + +
+
+
▮▮
+
+
Scan paused
+
+
+
+
+ +
@@ -1058,9 +1087,11 @@ function updateScanUI(s) { chip.classList.add(s.status); const isRunning = s.status === 'running'; + const isPaused = s.status === 'paused'; el('progress-area').classList.toggle('show', isRunning); - el('first-scan-ui').style.display = (s.scan_id || isRunning) ? 'none' : ''; - el('rescan-area').classList.toggle('show', !isRunning && !!s.scan_id); + el('paused-area').classList.toggle('show', isPaused); + el('first-scan-ui').style.display = (s.scan_id || isRunning || isPaused) ? 'none' : ''; + el('rescan-area').classList.toggle('show', !isRunning && !isPaused && !!s.scan_id); if (isRunning) { el('progress-msg').textContent = s.message || ''; @@ -1081,7 +1112,16 @@ function updateScanUI(s) { }); } - if (s.scan_id && !isRunning) { + if (isPaused) { + const parts = []; + if (s.folder_path) parts.push(`Folder: ${s.folder_path}`); + if (s.files_indexed) parts.push(`${fmt(s.files_indexed)} files indexed`); + if (s.phashes_done) parts.push(`${fmt(s.phashes_done)} phashes computed`); + if (s.message) parts.push(s.message); + el('pause-details').textContent = parts.join(' · ') || 'Progress saved'; + } + + if (s.scan_id && !isRunning && !isPaused) { // populate rescan folder from last scan el('rescan-folder-input').value = el('folder-input').value || '/photos'; } @@ -1114,11 +1154,20 @@ async function startScan(mode) { } } -async function cancelScan() { +async function pauseScan() { try { - await api('POST', '/api/scan/cancel'); - showToast('Cancelling scan...'); - } catch(e) {} + await api('POST', '/api/scan/pause'); + showToast('Pausing scan — finishing in-flight work...'); + } catch(e) { showToast('Error: ' + e.message, 3000); } +} + +async function resumeScan() { + try { + await api('POST', '/api/scan/resume'); + state.scanStatus = 'running'; + showToast('Resuming scan...'); + startPoller(); + } catch(e) { showToast('Error: ' + e.message, 4000); } } function confirmFullReset() { @@ -1548,6 +1597,7 @@ async function init() { try { const s = await api('GET', '/api/scan/status'); updateScanUI(s); + state.scanStatus = s.status; if (s.status === 'running') startPoller(); } catch(e) {} }