The Odoo 18 sitemap.xml returns 404. The fallback URL list also failed because urljoin(BASE_URL, "/applications/...") strips the /documentation/18.0 path (an absolute-path argument replaces the whole path component in urljoin). Changes: - Add discover_urls_by_crawl(): fetches each module index page and collects all internal links; it replaces the sitemap as the primary source. - crawl() now chains: sitemap → crawl discovery → hardcoded fallback. - Fix fallback_urls() to use BASE_URL + path (not urljoin) and trim the list to known-good pages. - Keep crawl discovery rate-limited (0.5s between module seeds). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
341 lines
12 KiB
Python
341 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Odoo 18 Documentation Scraper
|
|
==============================
|
|
Crawls the Odoo 18 docs sitemap, extracts clean text from each page,
|
|
and saves structured JSON ready for the indexer.
|
|
|
|
Usage:
|
|
python scraper.py # full crawl
|
|
python scraper.py --module accounting # single module
|
|
python scraper.py --limit 50 # test run
|
|
|
|
Output: ../data/raw/odoo18_docs_raw.jsonl
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
import re
|
|
import argparse
|
|
import hashlib
|
|
import logging
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin
|
|
from dataclasses import dataclass, asdict
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Root logger is configured at INFO level, so log.debug() calls in this
# module are not shown unless the level is changed.
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
log = logging.getLogger(__name__)
|
|
|
|
# Root of the Odoo 18 documentation; page paths are appended to it by plain
# string concatenation.
BASE_URL = "https://www.odoo.com/documentation/18.0"
SITEMAP_URL = f"{BASE_URL}/sitemap.xml"
# Output location (relative to the current working directory); crawl()
# writes one JSON object per line.
OUTPUT_DIR = Path("../data/raw")
OUTPUT_FILE = OUTPUT_DIR / "odoo18_docs_raw.jsonl"
DELAY_SECONDS = 1.2  # pause after each page processed by crawl()
MAX_RETRIES = 3  # attempts per page in fetch_page()
REQUEST_TIMEOUT = 20  # seconds, passed as ``timeout`` to requests.get()
|
|
|
|
# Browser-like request headers sent with every requests.get() call in this
# module.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}
|
|
|
|
# Module name -> documentation path below BASE_URL.
# The keys are the accepted values of the --module option.
# NOTE: insertion order matters. infer_module() returns the FIRST entry whose
# path occurs in a URL, so a more specific path (e.g. "accounting") must stay
# ahead of any shorter path that is a prefix of it (e.g. "invoicing").
MODULE_PATHS = {
    "accounting": "/applications/finance/accounting",
    "invoicing": "/applications/finance",
    "inventory": "/applications/inventory_and_mrp/inventory",
    "purchase": "/applications/inventory_and_mrp/purchase",
    "manufacturing": "/applications/inventory_and_mrp/manufacturing",
    "sales": "/applications/sales/sales",
    "crm": "/applications/sales/crm",
    "employees": "/applications/hr/employees",
    "payroll": "/applications/hr/payroll",
    "timesheets": "/applications/services/timesheets",
    "project": "/applications/services/project",
    "helpdesk": "/applications/services/helpdesk",
    "ecommerce": "/applications/websites/ecommerce",
    "website": "/applications/websites/website",
    "marketing": "/applications/marketing",
    "pos": "/applications/sales/point_of_sale",
    "quality": "/applications/inventory_and_mrp/quality",
    "maintenance": "/applications/inventory_and_mrp/maintenance",
    "fleet": "/applications/hr/fleet",
    "discuss": "/applications/productivity/discuss",
    "studio": "/applications/studio",
    "general": "/applications/general",
    "install": "/administration",
}
|
|
|
|
# CSS selectors for page chrome (navigation, sidebars, footers, scripts...).
# clean_text() removes every element matching one of these from the parsed
# page before extracting text.
NOISE_SELECTORS = [
    "nav", "footer", "header", ".toctree-wrapper",
    ".wy-nav-side", ".wy-menu", ".wy-side-nav-search",
    ".rst-footer-buttons", "#edit-on-github",
    "[role='navigation']", ".breadcrumbs",
    ".sidebar", ".sphinxsidebar",
    "script", "style",
]
|
|
|
|
|
|
@dataclass
class DocPage:
    """One scraped documentation page; crawl() writes it as a JSONL record."""

    url: str  # URL the page was fetched from
    title: str  # <title> text with the trailing "— Odoo..." part removed
    module: str  # MODULE_PATHS key inferred from the URL ("general" if none)
    section: str  # breadcrumb trail joined with " > ", else the <h1> text
    text: str  # cleaned body text with markdown-style headings and bullets
    headings: list[str]  # texts of the h2/h3 headings found in the body
    doc_id: str  # first 16 hex characters of the SHA-256 of the URL
|
|
|
|
|
|
def fetch_sitemap_urls(sitemap_url: str, module_filter: str | None) -> list:
    """Return the Odoo 18 documentation URLs listed in a sitemap.

    Args:
        sitemap_url: URL of the sitemap XML document to download.
        module_filter: Optional key of ``MODULE_PATHS``. When given, only
            URLs containing that module's path are returned.

    Returns:
        The matching ``<loc>`` URLs in sitemap order, with translated pages
        (fr, de, es, pt, nl, zh) excluded.

    Raises:
        ValueError: If ``module_filter`` is not a key of ``MODULE_PATHS``.
        requests.RequestException: If the download fails or the server
            answers with an error status.
    """
    # Validate the filter BEFORE the network call so a typo fails fast and is
    # not masked by an unrelated HTTP error.
    module_path = None
    if module_filter:
        module_path = MODULE_PATHS.get(module_filter)
        if not module_path:
            raise ValueError(f"Unknown module '{module_filter}'. Choose from: {', '.join(MODULE_PATHS)}")

    log.info(f"Fetching sitemap: {sitemap_url}")
    resp = requests.get(sitemap_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()

    # NOTE: Beautiful Soup's "xml" feature is provided by lxml, which must be
    # installed for this call to work.
    soup = BeautifulSoup(resp.text, "xml")
    all_urls = [loc.text.strip() for loc in soup.find_all("loc")]

    translated = ("fr", "de", "es", "pt", "nl", "zh")
    urls = [
        u for u in all_urls
        if ("/18.0/" in u or "/documentation/18.0" in u)
        and not any(f"/{lang}/" in u for lang in translated)
    ]

    if module_path:
        urls = [u for u in urls if module_path in u]
        log.info(f"Module filter '{module_filter}': {len(urls)} pages")
    else:
        log.info(f"Total pages: {len(urls)}")
    return urls
|
|
|
|
|
|
def discover_urls_by_crawl(module_filter: str | None = None) -> list:
    """Discover doc URLs from each module's index page.

    For every module, the index page is requested at ``<path>.html`` and, if
    that does not answer 200, at ``<path>/index.html``. The first seed that
    answers 200 is kept, together with every link on it that stays below the
    module's path. Discovery is one level deep: linked pages are not followed.

    Args:
        module_filter: Optional key of ``MODULE_PATHS`` restricting discovery
            to a single module.

    Returns:
        Sorted list of unique absolute URLs (fragments removed). The list is
        empty when no seed page could be fetched.

    Raises:
        ValueError: If ``module_filter`` is not a key of ``MODULE_PATHS``.
    """
    if module_filter:
        path = MODULE_PATHS.get(module_filter)
        if not path:
            raise ValueError(f"Unknown module '{module_filter}'. Choices: {', '.join(MODULE_PATHS)}")
        module_paths = {module_filter: path}
    else:
        module_paths = MODULE_PATHS

    found: set = set()

    for module, path in module_paths.items():
        prefix = BASE_URL + path
        # Module index pages are at <path>.html or <path>/index.html
        for seed in (prefix + ".html", prefix + "/index.html"):
            try:
                resp = requests.get(seed, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as e:
                # Logged at WARNING: the module-wide INFO level would hide a
                # DEBUG message and make a failed discovery look silent.
                log.warning(f" seed {seed} failed: {e}")
                resp = None

            # Rate-limit every request, including failed and non-200 ones.
            time.sleep(0.5)

            if resp is None or resp.status_code != 200:
                continue

            soup = BeautifulSoup(resp.text, "html.parser")
            links = {seed}
            for a in soup.find_all("a", href=True):
                # Resolve relative hrefs against the seed and drop fragments.
                abs_url = urljoin(seed, a["href"]).split("#")[0]
                if abs_url.startswith(prefix):
                    links.add(abs_url)

            found |= links
            log.info(f" {module}: {len(links)} URLs from {seed}")
            break  # one successful seed per module is enough

    urls = sorted(found)
    log.info(f"Crawl discovery: {len(urls)} total URLs across {len(module_paths)} modules")
    return urls
|
|
|
|
|
|
def fallback_urls() -> list:
    """Build the hardcoded page list used when no other URL source works.

    Returns:
        Absolute URLs formed by appending each known page path to
        ``BASE_URL``.
    """
    known_pages = (
        "/applications/finance/accounting.html",
        "/applications/finance/accounting/customer_invoices.html",
        "/applications/finance/accounting/vendor_bills.html",
        "/applications/finance/accounting/get_started/chart_of_accounts.html",
        "/applications/finance/accounting/taxes.html",
        "/applications/finance/accounting/reporting.html",
        "/applications/finance/accounting/bank.html",
        "/applications/finance/expenses.html",
        "/applications/finance/payment_providers.html",
        "/applications/sales/sales.html",
        "/applications/sales/crm.html",
        "/applications/sales/crm/pipeline.html",
        "/applications/inventory_and_mrp/inventory.html",
        "/applications/inventory_and_mrp/purchase.html",
        "/applications/inventory_and_mrp/manufacturing.html",
        "/applications/hr/employees.html",
        "/applications/hr/payroll.html",
        "/applications/hr/payroll/payslips.html",
        "/applications/websites/ecommerce.html",
        "/applications/services/helpdesk.html",
        "/applications/services/project.html",
    )
    # Plain concatenation on purpose: urljoin() treats a path starting with
    # "/" as absolute and would drop the "/documentation/18.0" part.
    return [f"{BASE_URL}{page}" for page in known_pages]
|
|
|
|
|
|
def infer_module(url: str) -> str:
    """Map a documentation URL to its module key.

    The first entry of ``MODULE_PATHS`` (in insertion order) whose path,
    stripped of leading slashes, occurs anywhere in ``url`` wins.

    Returns:
        The matching module key, or ``"general"`` when nothing matches.
    """
    candidates = (
        name
        for name, doc_path in MODULE_PATHS.items()
        if doc_path.lstrip("/") in url
    )
    return next(candidates, "general")
|
|
|
|
|
|
def extract_section(soup: BeautifulSoup) -> str:
    """Return a human-readable section label for a parsed page.

    Preference order:
      1. the breadcrumb link texts joined with ``" > "``;
      2. the text of the first ``<h1>``;
      3. the constant ``"Odoo 18 Docs"``.

    A candidate that yields only empty text (for example breadcrumb links
    that contain nothing but icons) is skipped, so the result is never an
    empty string.

    Args:
        soup: Parsed page. It is only read, not modified.
    """
    crumbs = soup.select(".breadcrumbs a, .wy-breadcrumbs a, nav[aria-label='breadcrumb'] a")
    labels = [label for a in crumbs if (label := a.get_text(strip=True))]
    if labels:
        return " > ".join(labels)

    h1 = soup.find("h1")
    heading = h1.get_text(strip=True) if h1 else ""
    return heading or "Odoo 18 Docs"
|
|
|
|
|
|
def clean_text(soup: BeautifulSoup) -> tuple:
    """Extract readable text and section headings from a parsed page.

    Elements matching ``NOISE_SELECTORS`` are removed from ``soup`` in place,
    so the caller's tree is modified. The main content container is then
    walked and rendered as plain text: headings h1-h4 become markdown ``#``
    lines, paragraphs longer than 20 characters are kept, list items longer
    than 5 characters become ``- `` bullets, and ``<code>`` text is wrapped
    in backticks.

    Returns:
        ``(text, headings)`` where ``headings`` holds the h2/h3 texts in
        document order, or ``("", [])`` when no content container exists.
    """
    # Drop navigation chrome before looking for the content container.
    for selector in NOISE_SELECTORS:
        for noise in soup.select(selector):
            noise.decompose()

    # First container that exists wins, from most to least specific.
    root = soup.find("div", {"class": "document"})
    if not root:
        root = soup.find("article")
    if not root:
        root = soup.find("main")
    if not root:
        root = soup.find("div", {"role": "main"})
    if not root:
        root = soup.find("body")
    if not root:
        return "", []

    heading_tags = ("h1", "h2", "h3", "h4")
    section_titles = []
    chunks = []

    # NOTE(review): .descendants also yields elements nested inside ones that
    # were already rendered (e.g. a <p> or <code> inside an <li>), so their
    # text is appended a second time. Left as-is to preserve the output.
    for node in root.descendants:
        tag = getattr(node, "name", None)
        if tag in heading_tags:
            label = node.get_text(strip=True)
            if not label:
                continue
            hashes = "#" * int(tag[1])  # "h3" -> "###"
            chunks.append(f"\n{hashes} {label}\n")
            if tag in ("h2", "h3"):
                section_titles.append(label)
        elif tag == "p":
            paragraph = node.get_text(separator=" ", strip=True)
            if len(paragraph) > 20:
                chunks.append(paragraph)
        elif tag == "li":
            item = node.get_text(separator=" ", strip=True)
            if len(item) > 5:
                chunks.append(f"- {item}")
        elif tag == "code":
            snippet = node.get_text(strip=True)
            if snippet:
                chunks.append(f"`{snippet}`")

    # Collapse runs of three or more newlines into a single blank line.
    joined = "\n".join(chunks)
    return re.sub(r"\n{3,}", "\n\n", joined).strip(), section_titles
|
|
|
|
|
|
def fetch_page(url: str, debug: bool = False) -> DocPage | None:
    """Download one documentation page and convert it to a ``DocPage``.

    Request errors are retried up to ``MAX_RETRIES`` times with exponential
    backoff (1s, 2s, ...). A 404 is not retried.

    Args:
        url: Absolute URL of the page.
        debug: When true, log the title, text length and a text snippet.

    Returns:
        The extracted page, or ``None`` when the page is a 404, yields fewer
        than 100 characters of text, or every attempt failed.
    """
    for attempt in range(MAX_RETRIES):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            if resp.status_code == 404:
                log.warning(f"404: {url}")
                return None
            resp.raise_for_status()

            soup = BeautifulSoup(resp.text, "html.parser")
            title_tag = soup.find("title")
            title = title_tag.get_text(strip=True) if title_tag else url
            title = re.sub(r"\s*—\s*Odoo.*", "", title).strip()

            # Read the section BEFORE clean_text(): that call removes nav and
            # breadcrumb elements from the tree, after which the breadcrumb
            # trail could no longer be found.
            section = extract_section(soup)

            text, headings = clean_text(soup)
            if debug:
                log.info(f" DEBUG title={title!r} text_len={len(text)} snippet={text[:120]!r}")
            if len(text) < 100:
                if not debug:
                    log.warning(f" Too short ({len(text)} chars): {url}")
                return None

            return DocPage(
                url=url,
                title=title,
                module=infer_module(url),
                section=section,
                text=text,
                headings=headings,
                doc_id=hashlib.sha256(url.encode()).hexdigest()[:16],
            )
        except requests.RequestException as e:
            if attempt < MAX_RETRIES - 1:
                wait = 2 ** attempt
                log.warning(f"Retry {attempt+1} for {url}: {e} (wait {wait}s)")
                time.sleep(wait)
            else:
                log.error(f"Failed: {url}: {e}")
    return None
|
|
|
|
|
|
def crawl(module: str | None = None, limit: int | None = None, debug: bool = False):
    """Collect page URLs, scrape each page and write the results as JSONL.

    URL sources are tried in order until one yields at least one URL:
    sitemap, then crawl discovery from the module index pages, then the
    hardcoded fallback list.

    Args:
        module: Optional key of ``MODULE_PATHS`` restricting the crawl.
        limit: Optional maximum number of pages to fetch.
        debug: Forwarded to ``fetch_page`` to log extracted snippets.

    Raises:
        ValueError: If ``module`` is not a key of ``MODULE_PATHS``.

    Side effects:
        Creates ``OUTPUT_DIR`` if needed and overwrites ``OUTPUT_FILE``.
    """
    # Reject an unknown module up front. Otherwise the ValueError raised by
    # the URL sources is swallowed by the fallback chain below and the run
    # ends up crawling the whole, unfiltered fallback list.
    if module and module not in MODULE_PATHS:
        raise ValueError(f"Unknown module '{module}'. Choose from: {', '.join(MODULE_PATHS)}")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    try:
        urls = fetch_sitemap_urls(SITEMAP_URL, module)
        if not urls:
            raise ValueError("Sitemap returned 0 matching URLs")
    except Exception as e:
        log.warning(f"Sitemap unavailable ({e}), switching to crawl discovery")
        try:
            urls = discover_urls_by_crawl(module)
            # Discovery reports a total failure as an empty list rather than
            # an exception; treat that as a failure so the fallback is used.
            if not urls:
                raise ValueError("Crawl discovery returned 0 URLs")
        except Exception as e2:
            log.warning(f"Crawl discovery failed ({e2}), using hardcoded fallback list")
            urls = fallback_urls()
            if module:
                path = MODULE_PATHS[module]
                urls = [u for u in urls if path in u]

    if limit:
        urls = urls[:limit]

    log.info(f"Crawling {len(urls)} pages...")
    written = skipped = 0

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for i, url in enumerate(urls, 1):
            log.info(f"[{i}/{len(urls)}] {url}")
            page = fetch_page(url, debug=debug)
            if page:
                f.write(json.dumps(asdict(page), ensure_ascii=False) + "\n")
                written += 1
            else:
                skipped += 1
            # Be polite to the server between page requests.
            time.sleep(DELAY_SECONDS)

    log.info(f"\n✅ Done. Written: {written}, Skipped: {skipped}")
    log.info(f" Output: {OUTPUT_FILE}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the crawl.
    parser = argparse.ArgumentParser(description="Odoo 18 docs scraper")
    parser.add_argument(
        "--module",
        # Let argparse reject unknown names with a usage error, instead of
        # letting a typo fall through the URL-source fallback chain.
        choices=list(MODULE_PATHS),
        metavar="MODULE",
        help=f"Filter to one module: {', '.join(MODULE_PATHS)}",
    )
    parser.add_argument("--limit", type=int, help="Max pages (for testing)")
    parser.add_argument("--debug", action="store_true", help="Print extracted text snippets per page")
    args = parser.parse_args()
    crawl(module=args.module, limit=args.limit, debug=args.debug)
|