fix: browser UA for scraper, Qdrant healthcheck endpoint
The scraper was sending a bot User-Agent that triggered Cloudflare bot detection, so requests returned challenge pages with fewer than 100 characters of content. Switched to a standard Chrome User-Agent and added Accept and Accept-Language headers. The Qdrant healthcheck used /healthz, which does not exist in v1.9.0; changed it to GET /, which is always available. Added start_period: 30s so the check does not fire before Qdrant has had time to initialise, shortened the interval from 30s to 15s, and raised retries from 3 to 5. Added a --debug flag to the scraper for future extraction diagnostics. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -29,10 +29,11 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
- rag_net
|
- rag_net
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:6333/healthz"]
|
test: ["CMD", "curl", "-sf", "http://localhost:6333/"]
|
||||||
interval: 30s
|
interval: 15s
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 3
|
retries: 5
|
||||||
|
start_period: 30s
|
||||||
|
|
||||||
rag-api:
|
rag-api:
|
||||||
build: .
|
build: .
|
||||||
|
|||||||
@@ -39,9 +39,11 @@ REQUEST_TIMEOUT = 20
|
|||||||
|
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"User-Agent": (
|
"User-Agent": (
|
||||||
"Mozilla/5.0 (compatible; ActiveBlue-RAG-Indexer/1.0; "
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||||
"+https://activeblue.net)"
|
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
||||||
),
|
),
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
"Accept-Language": "en-US,en;q=0.9",
|
||||||
}
|
}
|
||||||
|
|
||||||
MODULE_PATHS = {
|
MODULE_PATHS = {
|
||||||
@@ -238,7 +240,7 @@ def clean_text(soup: BeautifulSoup) -> tuple:
|
|||||||
return clean, headings
|
return clean, headings
|
||||||
|
|
||||||
|
|
||||||
def fetch_page(url: str) -> DocPage | None:
|
def fetch_page(url: str, debug: bool = False) -> DocPage | None:
|
||||||
for attempt in range(MAX_RETRIES):
|
for attempt in range(MAX_RETRIES):
|
||||||
try:
|
try:
|
||||||
resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
|
resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
|
||||||
@@ -253,7 +255,11 @@ def fetch_page(url: str) -> DocPage | None:
|
|||||||
title = re.sub(r"\s*—\s*Odoo.*", "", title).strip()
|
title = re.sub(r"\s*—\s*Odoo.*", "", title).strip()
|
||||||
|
|
||||||
text, headings = clean_text(soup)
|
text, headings = clean_text(soup)
|
||||||
|
if debug:
|
||||||
|
log.info(f" DEBUG title={title!r} text_len={len(text)} snippet={text[:120]!r}")
|
||||||
if len(text) < 100:
|
if len(text) < 100:
|
||||||
|
if not debug:
|
||||||
|
log.warning(f" Too short ({len(text)} chars): {url}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return DocPage(
|
return DocPage(
|
||||||
@@ -275,7 +281,7 @@ def fetch_page(url: str) -> DocPage | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def crawl(module: str | None = None, limit: int | None = None):
|
def crawl(module: str | None = None, limit: int | None = None, debug: bool = False):
|
||||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -296,7 +302,7 @@ def crawl(module: str | None = None, limit: int | None = None):
|
|||||||
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
|
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
|
||||||
for i, url in enumerate(urls, 1):
|
for i, url in enumerate(urls, 1):
|
||||||
log.info(f"[{i}/{len(urls)}] {url}")
|
log.info(f"[{i}/{len(urls)}] {url}")
|
||||||
page = fetch_page(url)
|
page = fetch_page(url, debug=debug)
|
||||||
if page:
|
if page:
|
||||||
f.write(json.dumps(asdict(page), ensure_ascii=False) + "\n")
|
f.write(json.dumps(asdict(page), ensure_ascii=False) + "\n")
|
||||||
written += 1
|
written += 1
|
||||||
@@ -312,5 +318,6 @@ if __name__ == "__main__":
|
|||||||
parser = argparse.ArgumentParser(description="Odoo 18 docs scraper")
|
parser = argparse.ArgumentParser(description="Odoo 18 docs scraper")
|
||||||
parser.add_argument("--module", help=f"Filter to one module: {', '.join(MODULE_PATHS)}")
|
parser.add_argument("--module", help=f"Filter to one module: {', '.join(MODULE_PATHS)}")
|
||||||
parser.add_argument("--limit", type=int, help="Max pages (for testing)")
|
parser.add_argument("--limit", type=int, help="Max pages (for testing)")
|
||||||
|
parser.add_argument("--debug", action="store_true", help="Print extracted text snippets per page")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
crawl(module=args.module, limit=args.limit)
|
crawl(module=args.module, limit=args.limit, debug=args.debug)
|
||||||
|
|||||||
Reference in New Issue
Block a user