GPU: - Switch Dockerfile base to pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime - Add gpu_hasher.py: batched 2D DCT on GPU via PyTorch matrix multiply, 256 images/batch, produces imagehash-compatible 64-bit hex hashes, auto-falls back to CPU when CUDA unavailable - Replace per-image phash loop in scanner.py with phasher.hash_files() - docker-compose.yml: add nvidia GPU device reservation Hang fix: - takeout.is_takeout_folder() now caps at 50 directories (was walking entire tree — blocked for minutes on 65k+ file libraries) - Add "Not a Takeout folder" status message so takeout phase is never silent Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
155 lines
4.6 KiB
Python
155 lines
4.6 KiB
Python
"""
|
|
Google Takeout pre-processor.
|
|
Detects Takeout folder structures, reads JSON sidecars, and enriches
|
|
the files table with corrected timestamps, normalized filenames, and
|
|
edit-version flags.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
|
|
# Filename-stem suffixes Google Photos appends to edited copies of a photo
# (e.g. "IMG_001-edited.jpg"); checked case-insensitively by _is_edited().
EDIT_SUFFIXES = ("-edited", "-effects", "-smile", "-mix")
|
|
|
|
|
|
def _find_sidecar(media_path: str) -> str | None:
|
|
"""Return path to the JSON sidecar for a media file, or None."""
|
|
p = Path(media_path)
|
|
# Try filename.ext.json first, then filename.json
|
|
candidates = [
|
|
str(p) + ".json",
|
|
str(p.with_suffix(".json")),
|
|
]
|
|
for c in candidates:
|
|
if os.path.isfile(c):
|
|
return c
|
|
return None
|
|
|
|
|
|
def _strip_collision_suffix(filename: str) -> str:
|
|
"""Strip Google's (1), (2) collision suffixes from a filename."""
|
|
stem = Path(filename).stem
|
|
ext = Path(filename).suffix
|
|
cleaned = re.sub(r"\(\d+\)$", "", stem).rstrip()
|
|
return cleaned + ext
|
|
|
|
|
|
def _is_edited(filename: str) -> bool:
    """True when the filename's stem ends in a Google edit suffix (case-insensitive)."""
    # str.endswith accepts a tuple, so one call covers every suffix.
    return Path(filename).stem.lower().endswith(EDIT_SUFFIXES)
|
|
|
|
|
|
def is_takeout_folder(folder_path: str) -> bool:
    """
    Quick heuristic for Google Takeout exports.

    Samples at most 50 directories (hidden ones pruned) and counts
    ".json" sidecars whose base name matches a sibling file in the same
    directory.  Five such pairs means "yes, Takeout"; exhausting the
    directory budget — or the tree — without five pairs means "no".
    """
    SAMPLE_LIMIT = 50  # hard cap so 65k+ file libraries never stall the scan
    pairs = 0
    visited = 0

    for root, subdirs, filenames in os.walk(folder_path):
        subdirs[:] = [d for d in subdirs if not d.startswith(".")]
        visited += 1
        if visited > SAMPLE_LIMIT:
            break

        names = set(filenames)
        # Each sidecar "<base>.json" pairs with a sibling named exactly <base>.
        sidecar_bases = (n[:-5] for n in filenames if n.endswith(".json"))
        for base in sidecar_bases:
            if base in names:
                pairs += 1
                if pairs >= 5:
                    return True
    return False
|
|
|
|
|
|
def process_takeout(folder_path: str, db_path: str) -> int:
    """
    Walk *folder_path*, find all media files with JSON sidecars, and
    enrich their records in the ``files`` table at *db_path*.

    For each matched file this sets is_takeout, a normalized filename,
    the raw sidecar JSON, and (when the sidecar carries a usable
    photoTakenTime) exif_datetime.  Files whose stem ends in a Google
    edit suffix additionally get is_edited = 1 when that column exists.

    Returns the number of files whose main UPDATE matched a row.
    """
    con = sqlite3.connect(db_path)
    con.row_factory = sqlite3.Row
    cur = con.cursor()

    enriched = 0

    try:
        for root, dirs, files in os.walk(folder_path):
            dirs[:] = [d for d in dirs if not d.startswith(".")]
            for fname in files:
                if fname.endswith(".json"):
                    continue  # sidecars themselves are not media
                media_path = os.path.join(root, fname)
                sidecar = _find_sidecar(media_path)
                if not sidecar:
                    continue

                try:
                    with open(sidecar, "r", encoding="utf-8") as f:
                        data = json.load(f)
                except (json.JSONDecodeError, OSError):
                    continue  # unreadable/corrupt sidecar: skip, don't abort the scan

                # Google stores capture time as a Unix-epoch string; normalize
                # to an ISO-8601 UTC timestamp, or leave None when unusable.
                photo_taken_ts = None
                try:
                    ts = int(data["photoTakenTime"]["timestamp"])
                    dt = datetime.fromtimestamp(ts, tz=timezone.utc)
                    photo_taken_ts = dt.strftime("%Y-%m-%dT%H:%M:%S")
                except (KeyError, ValueError, TypeError):
                    pass

                title = data.get("title", "")
                takeout_json_str = json.dumps(data)

                # Prefer the sidecar's title (original upload name) over the
                # on-disk name; either way strip "(n)" collision markers.
                normalized = _strip_collision_suffix(title or fname)

                updates = {
                    "is_takeout": 1,
                    "filename": normalized,
                    "takeout_json": takeout_json_str,
                }
                if photo_taken_ts:
                    updates["exif_datetime"] = photo_taken_ts

                # Column names come from the fixed dict above, never from
                # user input, so the f-string SET clause is injection-safe.
                set_clause = ", ".join(f"{k} = ?" for k in updates)
                values = list(updates.values()) + [media_path]

                cur.execute(
                    f"UPDATE files SET {set_clause}, updated_at = CURRENT_TIMESTAMP "
                    f"WHERE path = ?",
                    values,
                )
                # BUG FIX: capture the main UPDATE's rowcount *now* — the
                # optional is_edited UPDATE below overwrites cur.rowcount
                # (and leaves it undefined when it raises), which previously
                # made the enriched count wrong for edited files.
                matched = cur.rowcount

                if _is_edited(fname):
                    try:
                        cur.execute(
                            "UPDATE files SET is_edited = 1 WHERE path = ?",
                            (media_path,),
                        )
                    except sqlite3.OperationalError:
                        pass  # is_edited column not migrated yet, skip

                if matched > 0:
                    enriched += 1

        con.commit()
    finally:
        # Always release the connection, even if the walk raises.
        con.close()
    return enriched
|