#!/usr/bin/env python3
"""
Odoo 18 Documentation Scraper
==============================
Crawls the Odoo 18 docs sitemap, extracts clean text from each page,
and saves structured JSON ready for the indexer.

Usage:
    python scraper.py                        # full crawl
    python scraper.py --module accounting    # single module
    python scraper.py --limit 50             # test run

Output:
    ../data/raw/odoo18_docs_raw.jsonl
"""

import json
import time
import re
import argparse
import hashlib
import logging
from pathlib import Path
from urllib.parse import urljoin
from dataclasses import dataclass, asdict

import requests
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
log = logging.getLogger(__name__)

BASE_URL = "https://www.odoo.com/documentation/18.0"
SITEMAP_URL = f"{BASE_URL}/sitemap.xml"
OUTPUT_DIR = Path("../data/raw")
OUTPUT_FILE = OUTPUT_DIR / "odoo18_docs_raw.jsonl"

DELAY_SECONDS = 1.2   # pause after every page request in crawl()
MAX_RETRIES = 3       # attempts per page in fetch_page()
REQUEST_TIMEOUT = 20  # seconds, passed to requests.get()

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (compatible; ActiveBlue-RAG-Indexer/1.0; "
        "+https://activeblue.net)"
    ),
}

# Module name -> URL path fragment identifying that module's pages.
# Order matters for infer_module(): the first fragment found in a URL wins,
# so a more specific fragment must precede any fragment that is its prefix
# ("accounting" before "invoicing").
MODULE_PATHS = {
    "accounting": "/applications/finance/accounting",
    "invoicing": "/applications/finance",
    "inventory": "/applications/inventory_and_mrp/inventory",
    "purchase": "/applications/inventory_and_mrp/purchase",
    "manufacturing": "/applications/inventory_and_mrp/manufacturing",
    "sales": "/applications/sales/sales",
    "crm": "/applications/sales/crm",
    "employees": "/applications/hr/employees",
    "payroll": "/applications/hr/payroll",
    "timesheets": "/applications/services/timesheets",
    "project": "/applications/services/project",
    "helpdesk": "/applications/services/helpdesk",
    "ecommerce": "/applications/websites/ecommerce",
    "website": "/applications/websites/website",
    "marketing": "/applications/marketing",
    "pos": "/applications/sales/point_of_sale",
    "quality": "/applications/inventory_and_mrp/quality",
    "maintenance": "/applications/inventory_and_mrp/maintenance",
    "fleet": "/applications/hr/fleet",
    "discuss": "/applications/productivity/discuss",
    "studio": "/applications/studio",
    "general": "/applications/general",
    "install": "/administration",
}

# CSS selectors for page chrome that clean_text() removes before extraction.
NOISE_SELECTORS = [
    "nav",
    "footer",
    "header",
    ".toctree-wrapper",
    ".wy-nav-side",
    ".wy-menu",
    ".wy-side-nav-search",
    ".rst-footer-buttons",
    "#edit-on-github",
    "[role='navigation']",
    ".breadcrumbs",
    ".sidebar",
    ".sphinxsidebar",
    "script",
    "style",
]

# URL path segments used to drop translated pages from the sitemap.
_TRANSLATED_LANGS = ("fr", "de", "es", "pt", "nl", "zh")


@dataclass
class DocPage:
    """One scraped documentation page, serialised as a JSONL record."""

    url: str             # page URL as fetched
    title: str           # <title> text with the "— Odoo…" suffix removed
    module: str          # key of MODULE_PATHS inferred from the URL
    section: str         # breadcrumb trail, or the first <h1> as a fallback
    text: str            # extracted body text with markdown-style headings
    headings: list[str]  # text of the h2/h3 headings found in the body
    doc_id: str          # first 16 hex chars of sha256(url)


def _require_module_path(module: str) -> str:
    """Return the path fragment for *module*, or raise ValueError if unknown."""
    path = MODULE_PATHS.get(module)
    if not path:
        raise ValueError(
            f"Unknown module '{module}'. Choose from: {', '.join(MODULE_PATHS)}"
        )
    return path


def fetch_sitemap_urls(sitemap_url: str, module_filter: str | None) -> list[str]:
    """Download the sitemap and return the English Odoo 18 page URLs.

    Args:
        sitemap_url: URL of the sitemap XML document.
        module_filter: optional key of MODULE_PATHS; when given, only URLs
            containing that module's path fragment are kept.

    Returns:
        The filtered list of page URLs, in sitemap order.

    Raises:
        ValueError: if *module_filter* is not a key of MODULE_PATHS.
        requests.RequestException: if the sitemap cannot be downloaded.
    """
    # Validate before touching the network so bad input fails immediately.
    path = _require_module_path(module_filter) if module_filter else None

    log.info("Fetching sitemap: %s", sitemap_url)
    resp = requests.get(sitemap_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "xml")
    all_urls = [loc.text.strip() for loc in soup.find_all("loc")]
    urls = [
        u
        for u in all_urls
        if ("/18.0/" in u or "/documentation/18.0" in u)
        and not any(f"/{lang}/" in u for lang in _TRANSLATED_LANGS)
    ]

    if path:
        urls = [u for u in urls if path in u]
        log.info("Module filter '%s': %d pages", module_filter, len(urls))
    else:
        log.info("Total pages: %d", len(urls))
    return urls


def fallback_urls() -> list[str]:
    """Curated fallback list if sitemap is unavailable."""
    paths = [
        "/applications/finance/accounting.html",
        "/applications/finance/accounting/customer_invoices.html",
        "/applications/finance/accounting/customer_invoices/overview.html",
        "/applications/finance/accounting/vendor_bills.html",
        "/applications/finance/accounting/get_started/chart_of_accounts.html",
        "/applications/finance/accounting/get_started/cheat_sheet.html",
        "/applications/finance/accounting/get_started/multi_currency.html",
        "/applications/finance/accounting/reporting/budget.html",
        "/applications/finance/accounting/reporting/analytic_accounting.html",
        "/applications/finance/accounting/bank.html",
        "/applications/finance/accounting/taxes.html",
        "/applications/finance/accounting/reporting.html",
        "/applications/finance/expenses.html",
        "/applications/finance/expenses/reinvoice_expenses.html",
        "/applications/finance/payment_providers.html",
        "/applications/finance.html",
        "/applications/sales.html",
        "/applications/sales/sales.html",
        "/applications/sales/crm.html",
        "/applications/sales/crm/pipeline.html",
        "/applications/sales/crm/acquire_leads/email_manual.html",
        "/applications/sales/crm/pipeline/manage_sales_teams.html",
        "/applications/sales/crm/optimize/utilize_activities.html",
        "/applications/inventory_and_mrp/inventory.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/mto.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/reordering_rules.html",
        "/applications/inventory_and_mrp/inventory/shipping_receiving/daily_operations.html",
        "/applications/inventory_and_mrp/purchase.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/rfq.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/manage.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/blanket_orders.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/calls_for_tenders.html",
        "/applications/inventory_and_mrp/manufacturing.html",
        "/applications/inventory_and_mrp/manufacturing/workflows.html",
        "/applications/inventory_and_mrp/manufacturing/workflows/use_mps.html",
        "/applications/inventory_and_mrp/manufacturing/workflows/manufacturing_backorders.html",
        "/applications/inventory_and_mrp/manufacturing/subcontracting.html",
        "/applications/inventory_and_mrp/manufacturing/advanced_configuration/kit_shipping.html",
        "/applications/hr.html",
        "/applications/hr/employees.html",
        "/applications/hr/employees/new_employee.html",
        "/applications/hr/payroll.html",
        "/applications/hr/payroll/contracts.html",
        "/applications/hr/payroll/payslips.html",
        "/applications/hr/payroll/batches.html",
        "/applications/websites/ecommerce.html",
        "/applications/websites/ecommerce/products.html",
        "/applications/websites/ecommerce/checkout_payment_shipping/checkout.html",
        "/applications/websites/ecommerce/checkout_payment_shipping/payments.html",
        "/applications/websites/ecommerce/customer_accounts.html",
        "/applications/services/helpdesk.html",
        "/applications/services/helpdesk/advanced/after_sales.html",
        "/applications/services/project.html",
        "/applications/finance/fiscal_localizations/united_states.html",
        "/applications.html",
        "/applications/general.html",
    ]
    # urljoin() resolves a reference starting with "/" against the host root,
    # which would drop the "/documentation/18.0" prefix of BASE_URL. Join a
    # relative path onto a base ending in "/" so the prefix is preserved.
    base = BASE_URL.rstrip("/") + "/"
    return [urljoin(base, p.lstrip("/")) for p in paths]


def infer_module(url: str) -> str:
    """Return the first MODULE_PATHS key whose path fragment occurs in *url*.

    Falls back to "general" when no fragment matches.
    """
    for module, path in MODULE_PATHS.items():
        if path.lstrip("/") in url:
            return module
    return "general"


def extract_section(soup: BeautifulSoup) -> str:
    """Return the page's breadcrumb trail joined with " > ".

    When no breadcrumb links are found, the first <h1> text is used, and
    "Odoo 18 Docs" when the page has no <h1> either.
    """
    bc = soup.select(".breadcrumbs a, .wy-breadcrumbs a, nav[aria-label='breadcrumb'] a")
    if bc:
        labels = (a.get_text(strip=True) for a in bc)
        return " > ".join(label for label in labels if label)
    h1 = soup.find("h1")
    return h1.get_text(strip=True) if h1 else "Odoo 18 Docs"


def clean_text(soup: BeautifulSoup) -> tuple:
    """Strip page chrome from *soup* and extract its readable text.

    The soup is modified in place: every element matching NOISE_SELECTORS is
    removed before extraction.

    Returns:
        A ``(text, headings)`` tuple. ``text`` holds headings rendered with
        markdown-style ``#`` prefixes, paragraphs, ``- `` list items and
        backtick-wrapped code. ``headings`` lists the h2/h3 heading texts.
        Returns ``("", [])`` when no content container is found.
    """
    for sel in NOISE_SELECTORS:
        for el in soup.select(sel):
            el.decompose()

    # First matching container wins, from most to least specific.
    content = (
        soup.find("div", {"class": "document"})
        or soup.find("article")
        or soup.find("main")
        or soup.find("div", {"role": "main"})
        or soup.find("body")
    )
    if not content:
        return "", []

    headings = []
    lines = []
    # NOTE(review): descendants visits nested elements too, so text inside a
    # <code> within a <p> (or a <p> within an <li>) appears to be emitted
    # once per enclosing element. Left unchanged; confirm against the indexer.
    for el in content.descendants:
        name = getattr(el, "name", None)
        if name is None:  # text nodes carry no tag name
            continue
        if name in ("h1", "h2", "h3", "h4"):
            text = el.get_text(strip=True)
            if text:
                prefix = "#" * int(name[1])  # heading level -> number of '#'
                lines.append(f"\n{prefix} {text}\n")
                if name in ("h2", "h3"):
                    headings.append(text)
        elif name == "p":
            text = el.get_text(separator=" ", strip=True)
            if text and len(text) > 20:  # skip very short paragraphs
                lines.append(text)
        elif name == "li":
            text = el.get_text(separator=" ", strip=True)
            if text and len(text) > 5:
                lines.append(f"- {text}")
        elif name == "code":
            text = el.get_text(strip=True)
            if text:
                lines.append(f"`{text}`")

    raw = "\n".join(lines)
    clean = re.sub(r"\n{3,}", "\n\n", raw).strip()  # collapse blank-line runs
    return clean, headings


def fetch_page(url: str) -> DocPage | None:
    """Download and parse one documentation page.

    Retries up to MAX_RETRIES times on request errors, waiting 1s, 2s, ...
    between attempts.

    Returns:
        A DocPage, or None when the page returns 404, yields fewer than 100
        characters of text, or still fails after the last retry.
    """
    for attempt in range(MAX_RETRIES):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            if resp.status_code == 404:
                # A missing page will not appear on retry; give up at once.
                log.warning("404: %s", url)
                return None
            resp.raise_for_status()

            soup = BeautifulSoup(resp.text, "html.parser")
            title_tag = soup.find("title")
            title = title_tag.get_text(strip=True) if title_tag else url
            title = re.sub(r"\s*—\s*Odoo.*", "", title).strip()

            text, headings = clean_text(soup)
            if len(text) < 100:
                return None

            return DocPage(
                url=url,
                title=title,
                module=infer_module(url),
                section=extract_section(soup),
                text=text,
                headings=headings,
                doc_id=hashlib.sha256(url.encode()).hexdigest()[:16],
            )
        except requests.RequestException as e:
            if attempt < MAX_RETRIES - 1:
                wait = 2 ** attempt
                log.warning("Retry %d for %s: %s (wait %ds)", attempt + 1, url, e, wait)
                time.sleep(wait)
            else:
                log.error("Failed: %s: %s", url, e)
    return None


def crawl(module: str | None = None, limit: int | None = None):
    """Crawl the docs and write one JSON object per page to OUTPUT_FILE.

    Args:
        module: optional key of MODULE_PATHS restricting the crawl.
        limit: optional maximum number of pages; a falsy value (None or 0)
            means no limit.

    Raises:
        ValueError: if *module* is not a key of MODULE_PATHS.
    """
    # Validate up front. Otherwise the ValueError raised by
    # fetch_sitemap_urls() is swallowed by the handler below, reported as a
    # sitemap failure, and the empty fallback filter matches every URL.
    module_path = _require_module_path(module) if module else None

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    try:
        urls = fetch_sitemap_urls(SITEMAP_URL, module)
    except Exception as e:
        # Deliberately broad: any sitemap failure (network, HTTP, parsing)
        # degrades to the curated list instead of aborting the crawl.
        log.warning("Sitemap unavailable (%s), using fallback list", e)
        urls = fallback_urls()
        if module_path:
            urls = [u for u in urls if module_path.lstrip("/") in u]

    if limit:
        urls = urls[:limit]

    log.info("Crawling %d pages...", len(urls))
    written = skipped = 0

    # Mode "w" truncates any previous output; each run starts a fresh file.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for i, url in enumerate(urls, 1):
            log.info("[%d/%d] %s", i, len(urls), url)
            page = fetch_page(url)
            if page:
                f.write(json.dumps(asdict(page), ensure_ascii=False) + "\n")
                written += 1
            else:
                skipped += 1
            time.sleep(DELAY_SECONDS)

    log.info("\n✅ Done. Written: %d, Skipped: %d", written, skipped)
    log.info(" Output: %s", OUTPUT_FILE)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Odoo 18 docs scraper")
    parser.add_argument(
        "--module",
        choices=list(MODULE_PATHS),
        metavar="MODULE",
        help=f"Filter to one module: {', '.join(MODULE_PATHS)}",
    )
    parser.add_argument("--limit", type=int, help="Max pages (for testing)")
    args = parser.parse_args()
    crawl(module=args.module, limit=args.limit)