fix: use absolute /app/data/raw path so data survives container exit

../data/raw, resolved relative to WORKDIR /app, becomes /data/raw, which
is outside the volume mount (./data:/app/data). As a result, the scraper
wrote 583 pages to the ephemeral container filesystem, and they were
lost on exit. Both the scraper and the indexer now use /app/data/raw so
the JSONL file lands inside the mounted volume and persists between
containers.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-14 21:39:10 -04:00
parent bc054cd478
commit 4b85f76947
2 changed files with 2 additions and 2 deletions

View File

@@ -39,7 +39,7 @@ QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text") EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "odoo18_docs") COLLECTION_NAME = os.getenv("COLLECTION_NAME", "odoo18_docs")
VECTOR_SIZE = 768 VECTOR_SIZE = 768
RAW_DATA_FILE = Path("../data/raw/odoo18_docs_raw.jsonl") RAW_DATA_FILE = Path("/app/data/raw/odoo18_docs_raw.jsonl")
BATCH_SIZE = 32 BATCH_SIZE = 32
CHUNK_SIZE = 512 CHUNK_SIZE = 512
CHUNK_OVERLAP = 64 CHUNK_OVERLAP = 64

View File

@@ -31,7 +31,7 @@ log = logging.getLogger(__name__)
BASE_URL = "https://www.odoo.com/documentation/18.0" BASE_URL = "https://www.odoo.com/documentation/18.0"
SITEMAP_URL = f"{BASE_URL}/sitemap.xml" SITEMAP_URL = f"{BASE_URL}/sitemap.xml"
OUTPUT_DIR = Path("../data/raw") OUTPUT_DIR = Path("/app/data/raw")
OUTPUT_FILE = OUTPUT_DIR / "odoo18_docs_raw.jsonl" OUTPUT_FILE = OUTPUT_DIR / "odoo18_docs_raw.jsonl"
DELAY_SECONDS = 1.2 DELAY_SECONDS = 1.2
MAX_RETRIES = 3 MAX_RETRIES = 3