diff --git a/docker-compose.yml b/docker-compose.yml index b12c17e..bd7de80 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -29,10 +29,11 @@ services: networks: - rag_net healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:6333/healthz"] - interval: 30s + test: ["CMD", "curl", "-sf", "http://localhost:6333/"] + interval: 15s timeout: 10s - retries: 3 + retries: 5 + start_period: 30s rag-api: build: . diff --git a/scraper/scraper.py b/scraper/scraper.py index 6f62881..d077538 100644 --- a/scraper/scraper.py +++ b/scraper/scraper.py @@ -39,9 +39,11 @@ REQUEST_TIMEOUT = 20 HEADERS = { "User-Agent": ( - "Mozilla/5.0 (compatible; ActiveBlue-RAG-Indexer/1.0; " - "+https://activeblue.net)" + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" ), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", } MODULE_PATHS = { @@ -238,7 +240,7 @@ def clean_text(soup: BeautifulSoup) -> tuple: return clean, headings -def fetch_page(url: str) -> DocPage | None: +def fetch_page(url: str, debug: bool = False) -> DocPage | None: for attempt in range(MAX_RETRIES): try: resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT) @@ -253,7 +255,11 @@ def fetch_page(url: str) -> DocPage | None: title = re.sub(r"\s*—\s*Odoo.*", "", title).strip() text, headings = clean_text(soup) + if debug: + log.info(f" DEBUG title={title!r} text_len={len(text)} snippet={text[:120]!r}") if len(text) < 100: + if not debug: + log.warning(f" Too short ({len(text)} chars): {url}") return None return DocPage( @@ -275,7 +281,7 @@ def fetch_page(url: str) -> DocPage | None: return None -def crawl(module: str | None = None, limit: int | None = None): +def crawl(module: str | None = None, limit: int | None = None, debug: bool = False): OUTPUT_DIR.mkdir(parents=True, exist_ok=True) try: @@ -296,7 +302,7 @@ def crawl(module: str | None = None, limit: int | None = None): with open(OUTPUT_FILE, "w", encoding="utf-8") as f: for i, url in enumerate(urls, 1): log.info(f"[{i}/{len(urls)}] {url}") - page = fetch_page(url) + page = fetch_page(url, debug=debug) if page: f.write(json.dumps(asdict(page), ensure_ascii=False) + "\n") written += 1 @@ -312,5 +318,6 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="Odoo 18 docs scraper") parser.add_argument("--module", help=f"Filter to one module: {', '.join(MODULE_PATHS)}") parser.add_argument("--limit", type=int, help="Max pages (for testing)") + parser.add_argument("--debug", action="store_true", help="Print extracted text snippets per page") args = parser.parse_args() - crawl(module=args.module, limit=args.limit) + crawl(module=args.module, limit=args.limit, debug=args.debug)