Scraper, indexer, and FastAPI query service for Retrieval-Augmented Generation over Odoo 18 documentation. Uses Qdrant + Ollama (nomic-embed-text + llama3.1). Integrates with ActiveBlue PeerBus agent interface. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
317 lines
12 KiB
Python
317 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Odoo 18 Documentation Scraper
|
|
==============================
|
|
Crawls the Odoo 18 docs sitemap, extracts clean text from each page,
|
|
and saves structured JSON ready for the indexer.
|
|
|
|
Usage:
|
|
python scraper.py # full crawl
|
|
python scraper.py --module accounting # single module
|
|
python scraper.py --limit 50 # test run
|
|
|
|
Output: ../data/raw/odoo18_docs_raw.jsonl
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
import re
|
|
import argparse
|
|
import hashlib
|
|
import logging
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin
|
|
from dataclasses import dataclass, asdict
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Log at INFO level using a compact "<LEVEL> <message>" line format.
logging.basicConfig(format="%(levelname)s %(message)s", level=logging.INFO)
log = logging.getLogger(__name__)

# Root of the Odoo 18 documentation site, and the sitemap that lists its pages.
BASE_URL = "https://www.odoo.com/documentation/18.0"
SITEMAP_URL = BASE_URL + "/sitemap.xml"

# Scraped pages are written to a single JSONL file (one JSON object per line).
OUTPUT_DIR = Path("../data/raw")
OUTPUT_FILE = OUTPUT_DIR.joinpath("odoo18_docs_raw.jsonl")

DELAY_SECONDS = 1.2   # pause after every page request in crawl()
MAX_RETRIES = 3       # attempts per page in fetch_page()
REQUEST_TIMEOUT = 20  # passed as `timeout=` to requests.get()
|
|
|
|
# HTTP headers sent with every request; the User-Agent identifies this crawler
# and carries a contact URL.
_USER_AGENT = (
    "Mozilla/5.0 (compatible; ActiveBlue-RAG-Indexer/1.0; "
    "+https://activeblue.net)"
)
HEADERS = {"User-Agent": _USER_AGENT}
|
|
|
|
# Shared URL path prefixes used to build MODULE_PATHS below.
_APPS = "/applications"
_FINANCE = _APPS + "/finance"
_MRP = _APPS + "/inventory_and_mrp"
_SALES = _APPS + "/sales"
_HR = _APPS + "/hr"
_SERVICES = _APPS + "/services"
_WEBSITES = _APPS + "/websites"

# Module name -> URL path fragment identifying that module's documentation.
# Insertion order is significant: infer_module() returns the FIRST entry whose
# path occurs in the URL, so a more specific path ("accounting") must come
# before a broader one that contains it ("invoicing").
MODULE_PATHS = {
    "accounting": _FINANCE + "/accounting",
    "invoicing": _FINANCE,
    "inventory": _MRP + "/inventory",
    "purchase": _MRP + "/purchase",
    "manufacturing": _MRP + "/manufacturing",
    "sales": _SALES + "/sales",
    "crm": _SALES + "/crm",
    "employees": _HR + "/employees",
    "payroll": _HR + "/payroll",
    "timesheets": _SERVICES + "/timesheets",
    "project": _SERVICES + "/project",
    "helpdesk": _SERVICES + "/helpdesk",
    "ecommerce": _WEBSITES + "/ecommerce",
    "website": _WEBSITES + "/website",
    "marketing": _APPS + "/marketing",
    "pos": _SALES + "/point_of_sale",
    "quality": _MRP + "/quality",
    "maintenance": _MRP + "/maintenance",
    "fleet": _HR + "/fleet",
    "discuss": _APPS + "/productivity/discuss",
    "studio": _APPS + "/studio",
    "general": _APPS + "/general",
    "install": "/administration",
}
|
|
|
|
# CSS selectors for page chrome that clean_text() deletes from the parsed page
# before extracting text.
NOISE_SELECTORS = [
    # Structural page furniture
    "nav",
    "footer",
    "header",
    ".toctree-wrapper",
    # "wy-*" side navigation and search
    ".wy-nav-side",
    ".wy-menu",
    ".wy-side-nav-search",
    # Footer buttons and edit link
    ".rst-footer-buttons",
    "#edit-on-github",
    # Navigation landmarks and breadcrumbs
    "[role='navigation']",
    ".breadcrumbs",
    # Sidebars
    ".sidebar",
    ".sphinxsidebar",
    # Non-content elements
    "script",
    "style",
]
|
|
|
|
|
|
@dataclass
class DocPage:
    """One scraped documentation page.

    crawl() serialises each instance with ``asdict`` into one line of the
    JSONL output file.
    """

    url: str             # URL the page was fetched from
    title: str           # <title> text with the trailing "— Odoo…" suffix removed
    module: str          # MODULE_PATHS key inferred from the URL ("general" if none match)
    section: str         # breadcrumb trail, or the first <h1> text as a fallback
    text: str            # cleaned, markdown-flavoured body text
    headings: list[str]  # texts of the page's h2/h3 headings, in document order
    doc_id: str          # first 16 hex characters of sha256(url)
|
|
|
|
|
|
def fetch_sitemap_urls(sitemap_url: str, module_filter: str | None) -> list:
    """Download the sitemap and return the documentation URLs to crawl.

    Args:
        sitemap_url: URL of the XML sitemap.
        module_filter: Optional MODULE_PATHS key; when given, only URLs that
            contain that module's path are returned.

    Returns:
        The ``<loc>`` URLs that belong to the 18.0 documentation and do not
        contain one of the excluded language path segments.

    Raises:
        requests.RequestException: The sitemap request failed or returned an
            HTTP error status.
        ValueError: ``module_filter`` is not a key of MODULE_PATHS.
    """
    log.info(f"Fetching sitemap: {sitemap_url}")
    response = requests.get(sitemap_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    sitemap = BeautifulSoup(response.text, "xml")

    # NOTE(review): only these six two-letter path segments are excluded; a
    # locale-style segment such as "/pt_BR/" would not match — confirm against
    # the real sitemap.
    translated_markers = [f"/{lang}/" for lang in ["fr", "de", "es", "pt", "nl", "zh"]]

    urls = []
    for loc in sitemap.find_all("loc"):
        candidate = loc.text.strip()
        if "/18.0/" not in candidate and "/documentation/18.0" not in candidate:
            continue
        if any(marker in candidate for marker in translated_markers):
            continue
        urls.append(candidate)

    if not module_filter:
        log.info(f"Total pages: {len(urls)}")
        return urls

    path = MODULE_PATHS.get(module_filter)
    if not path:
        raise ValueError(f"Unknown module '{module_filter}'. Choose from: {', '.join(MODULE_PATHS)}")
    urls = [u for u in urls if path in u]
    log.info(f"Module filter '{module_filter}': {len(urls)} pages")
    return urls
|
|
|
|
|
|
def fallback_urls() -> list:
    """Return a curated list of documentation URLs for when the sitemap is unavailable.

    Returns:
        Absolute URLs, each formed by appending a site-relative page path to
        BASE_URL.
    """
    paths = [
        "/applications/finance/accounting.html",
        "/applications/finance/accounting/customer_invoices.html",
        "/applications/finance/accounting/customer_invoices/overview.html",
        "/applications/finance/accounting/vendor_bills.html",
        "/applications/finance/accounting/get_started/chart_of_accounts.html",
        "/applications/finance/accounting/get_started/cheat_sheet.html",
        "/applications/finance/accounting/get_started/multi_currency.html",
        "/applications/finance/accounting/reporting/budget.html",
        "/applications/finance/accounting/reporting/analytic_accounting.html",
        "/applications/finance/accounting/bank.html",
        "/applications/finance/accounting/taxes.html",
        "/applications/finance/accounting/reporting.html",
        "/applications/finance/expenses.html",
        "/applications/finance/expenses/reinvoice_expenses.html",
        "/applications/finance/payment_providers.html",
        "/applications/finance.html",
        "/applications/sales.html",
        "/applications/sales/sales.html",
        "/applications/sales/crm.html",
        "/applications/sales/crm/pipeline.html",
        "/applications/sales/crm/acquire_leads/email_manual.html",
        "/applications/sales/crm/pipeline/manage_sales_teams.html",
        "/applications/sales/crm/optimize/utilize_activities.html",
        "/applications/inventory_and_mrp/inventory.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/mto.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/reordering_rules.html",
        "/applications/inventory_and_mrp/inventory/shipping_receiving/daily_operations.html",
        "/applications/inventory_and_mrp/purchase.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/rfq.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/manage.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/blanket_orders.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/calls_for_tenders.html",
        "/applications/inventory_and_mrp/manufacturing.html",
        "/applications/inventory_and_mrp/manufacturing/workflows.html",
        "/applications/inventory_and_mrp/manufacturing/workflows/use_mps.html",
        "/applications/inventory_and_mrp/manufacturing/workflows/manufacturing_backorders.html",
        "/applications/inventory_and_mrp/manufacturing/subcontracting.html",
        "/applications/inventory_and_mrp/manufacturing/advanced_configuration/kit_shipping.html",
        "/applications/hr.html",
        "/applications/hr/employees.html",
        "/applications/hr/employees/new_employee.html",
        "/applications/hr/payroll.html",
        "/applications/hr/payroll/contracts.html",
        "/applications/hr/payroll/payslips.html",
        "/applications/hr/payroll/batches.html",
        "/applications/websites/ecommerce.html",
        "/applications/websites/ecommerce/products.html",
        "/applications/websites/ecommerce/checkout_payment_shipping/checkout.html",
        "/applications/websites/ecommerce/checkout_payment_shipping/payments.html",
        "/applications/websites/ecommerce/customer_accounts.html",
        "/applications/services/helpdesk.html",
        "/applications/services/helpdesk/advanced/after_sales.html",
        "/applications/services/project.html",
        "/applications/finance/fiscal_localizations/united_states.html",
        "/applications.html",
        "/applications.general.html".replace("applications.general", "applications/general"),
    ]
    # Plain concatenation on purpose: urljoin() treats a path that starts with
    # "/" as absolute and resolves it against the host, which would discard the
    # "/documentation/18.0" prefix carried by BASE_URL.
    return [f"{BASE_URL}{p}" for p in paths]
|
|
|
|
|
|
def infer_module(url: str) -> str:
    """Map a documentation URL to a MODULE_PATHS key.

    Returns the first module (in MODULE_PATHS insertion order) whose path,
    with leading slashes stripped, occurs anywhere in ``url``; returns
    "general" when no module matches.
    """
    matches = (
        name
        for name, fragment in MODULE_PATHS.items()
        if fragment.lstrip("/") in url
    )
    return next(matches, "general")
|
|
|
|
|
|
def extract_section(soup: BeautifulSoup) -> str:
    """Return a human-readable section label for a parsed page.

    Uses the breadcrumb link texts joined with " > " when any breadcrumb
    anchors are present; otherwise the text of the first <h1>; otherwise the
    literal "Odoo 18 Docs".

    Note: clean_text() deletes nav/breadcrumb elements from the soup it is
    given, so breadcrumbs can only be found if this runs before clean_text()
    on the same soup.
    """
    crumbs = soup.select(".breadcrumbs a, .wy-breadcrumbs a, nav[aria-label='breadcrumb'] a")
    if crumbs:
        labels = [anchor.get_text(strip=True) for anchor in crumbs]
        # Anchors with no visible text (e.g. icon-only links) are skipped.
        return " > ".join(label for label in labels if label)

    first_h1 = soup.find("h1")
    if not first_h1:
        return "Odoo 18 Docs"
    return first_h1.get_text(strip=True)
|
|
|
|
|
|
def clean_text(soup: BeautifulSoup) -> tuple[str, list[str]]:
    """Extract markdown-flavoured body text and section headings from a page.

    WARNING: mutates ``soup`` in place — every element matching
    NOISE_SELECTORS is removed from the tree before extraction.

    Args:
        soup: Parsed HTML page.

    Returns:
        ``(text, headings)`` where ``text`` is the cleaned page text
        (h1-h4 rendered as "#"-prefixed lines, list items as "- " lines,
        stand-alone code as backtick-quoted lines) and ``headings`` holds the
        h2/h3 texts in document order. Returns ``("", [])`` when no content
        container is found.
    """
    for sel in NOISE_SELECTORS:
        for el in soup.select(sel):
            el.decompose()

    # First container that exists wins, from most to least specific.
    content = (
        soup.find("div", {"class": "document"})
        or soup.find("article")
        or soup.find("main")
        or soup.find("div", {"role": "main"})
        or soup.find("body")
    )
    if not content:
        return "", []

    text_tags = ("h1", "h2", "h3", "h4", "p", "li", "code")
    headings = []
    lines = []
    # id() of every element whose complete text is already in `lines`.
    # get_text() on an element includes all of its descendants' text, so a
    # <p>/<code>/<li> nested inside a captured element must be skipped or its
    # text would be emitted a second time. Identity is used instead of `==`
    # because bs4 tags compare structurally.
    captured = set()

    for el in content.descendants:
        name = getattr(el, "name", None)
        if name not in text_tags:
            continue  # text nodes and tags we do not emit
        if any(id(parent) in captured for parent in el.parents):
            continue  # already covered by an enclosing element's text

        if name in ("h1", "h2", "h3", "h4"):
            text = el.get_text(strip=True)
            if text:
                prefix = "#" * int(name[1])
                lines.append(f"\n{prefix} {text}\n")
                captured.add(id(el))
                if name in ("h2", "h3"):
                    headings.append(text)
        elif name == "p":
            text = el.get_text(separator=" ", strip=True)
            # Very short paragraphs are dropped (length threshold of 20).
            if text and len(text) > 20:
                lines.append(text)
                captured.add(id(el))
        elif name == "li":
            text = el.get_text(separator=" ", strip=True)
            if text and len(text) > 5:
                lines.append(f"- {text}")
                captured.add(id(el))
        else:  # "code" that is not inside an already-captured element
            text = el.get_text(strip=True)
            if text:
                lines.append(f"`{text}`")
                captured.add(id(el))

    raw = "\n".join(lines)
    # Collapse runs of three or more newlines into a single blank line.
    clean = re.sub(r"\n{3,}", "\n\n", raw).strip()
    return clean, headings
|
|
|
|
|
|
def fetch_page(url: str) -> DocPage | None:
    """Download one documentation page and convert it to a DocPage.

    Makes up to MAX_RETRIES attempts; after a failed attempt it sleeps
    ``2 ** attempt`` seconds before the next one.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A populated DocPage, or ``None`` when the server answers 404, the
        extracted text is shorter than 100 characters, or every attempt
        failed with a ``requests.RequestException``.
    """
    for attempt in range(MAX_RETRIES):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            if resp.status_code == 404:
                # A missing page will not appear on retry.
                log.warning(f"404: {url}")
                return None
            resp.raise_for_status()

            soup = BeautifulSoup(resp.text, "html.parser")
            title_tag = soup.find("title")
            title = title_tag.get_text(strip=True) if title_tag else url
            # Drop the trailing " — Odoo ..." site suffix from the <title>.
            title = re.sub(r"\s*—\s*Odoo.*", "", title).strip()

            # Read the section BEFORE clean_text(): clean_text() deletes the
            # nav/breadcrumb elements from `soup`, after which
            # extract_section() could only ever fall back to the <h1>.
            section = extract_section(soup)

            text, headings = clean_text(soup)
            if len(text) < 100:
                # Too little content to be worth indexing.
                return None

            return DocPage(
                url=url,
                title=title,
                module=infer_module(url),
                section=section,
                text=text,
                headings=headings,
                doc_id=hashlib.sha256(url.encode()).hexdigest()[:16],
            )
        except requests.RequestException as e:
            if attempt < MAX_RETRIES - 1:
                wait = 2 ** attempt
                log.warning(f"Retry {attempt+1} for {url}: {e} (wait {wait}s)")
                time.sleep(wait)
            else:
                log.error(f"Failed: {url}: {e}")
    return None
|
|
|
|
|
|
def crawl(module: str | None = None, limit: int | None = None):
    """Crawl the documentation and write one JSON object per page to OUTPUT_FILE.

    URLs come from the sitemap; if fetching or parsing the sitemap fails, the
    curated fallback_urls() list is used instead (filtered by ``module`` when
    given). OUTPUT_FILE is overwritten on every run.

    Args:
        module: Optional MODULE_PATHS key restricting the crawl to one module.
        limit: Optional maximum number of pages; ``None`` or ``0`` means no
            limit.

    Raises:
        ValueError: ``module`` is not a key of MODULE_PATHS.
    """
    # Validate before the broad `except Exception` below. Otherwise the
    # ValueError raised for an unknown module is mistaken for a sitemap
    # outage, and the fallback branch (which would filter with an empty path)
    # silently crawls every fallback page.
    if module and module not in MODULE_PATHS:
        raise ValueError(f"Unknown module '{module}'. Choose from: {', '.join(MODULE_PATHS)}")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    try:
        urls = fetch_sitemap_urls(SITEMAP_URL, module)
    except Exception as e:
        # Deliberately broad: any sitemap failure (network, HTTP status,
        # parser) degrades to the curated list.
        log.warning(f"Sitemap unavailable ({e}), using fallback list")
        urls = fallback_urls()
        if module:
            path = MODULE_PATHS[module]
            urls = [u for u in urls if path.lstrip("/") in u]

    if limit:
        urls = urls[:limit]

    log.info(f"Crawling {len(urls)} pages...")
    written = skipped = 0

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for i, url in enumerate(urls, 1):
            log.info(f"[{i}/{len(urls)}] {url}")
            page = fetch_page(url)
            if page:
                f.write(json.dumps(asdict(page), ensure_ascii=False) + "\n")
                written += 1
            else:
                skipped += 1
            # Fixed pause after every page, successful or not.
            time.sleep(DELAY_SECONDS)

    log.info(f"\n✅ Done. Written: {written}, Skipped: {skipped}")
    log.info(f" Output: {OUTPUT_FILE}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse options and run the crawl.
    parser = argparse.ArgumentParser(description="Odoo 18 docs scraper")
    parser.add_argument(
        "--module",
        # Reject unknown module names at parse time with a usage error.
        choices=list(MODULE_PATHS),
        # Keep the usage line short; the help text already lists the names.
        metavar="MODULE",
        help=f"Filter to one module: {', '.join(MODULE_PATHS)}",
    )
    parser.add_argument("--limit", type=int, help="Max pages (for testing)")
    args = parser.parse_args()
    crawl(module=args.module, limit=args.limit)
|