# odootrain/scraper/scraper.py

#!/usr/bin/env python3
"""
Odoo 18 Documentation Scraper
==============================
Crawls the Odoo 18 docs sitemap (falling back to link discovery or a
hardcoded URL list when it is unavailable), extracts clean text from each
page, and saves one JSON record per line (JSONL) ready for the indexer.

Usage:
    python scraper.py                      # full crawl
    python scraper.py --module accounting  # single module
    python scraper.py --limit 50           # test run

Output: ../data/raw/odoo18_docs_raw.jsonl
"""

import json
import time
import re
import argparse
import hashlib
import logging
from pathlib import Path
from urllib.parse import urljoin
from dataclasses import dataclass, asdict

import requests
from bs4 import BeautifulSoup

# Configure root logging once at import time: INFO and above, printed as
# "<LEVEL>  <message>". DEBUG messages are therefore not shown.
logging.basicConfig(level=logging.INFO, format="%(levelname)s  %(message)s")
# Module-level logger used by every function in this file.
log = logging.getLogger(__name__)

# Root of the Odoo 18 documentation; every page URL is built on top of it.
BASE_URL        = "https://www.odoo.com/documentation/18.0"
# Sitemap tried first when collecting page URLs.
SITEMAP_URL     = f"{BASE_URL}/sitemap.xml"
# Relative path, so it resolves against the current working directory.
OUTPUT_DIR      = Path("../data/raw")
# JSONL output file (one DocPage record per line), overwritten on each run.
OUTPUT_FILE     = OUTPUT_DIR / "odoo18_docs_raw.jsonl"
# Pause between consecutive page fetches in crawl().
DELAY_SECONDS   = 1.2
# Number of attempts fetch_page() makes per URL before giving up.
MAX_RETRIES     = 3
# Passed as `timeout=` to every requests.get() call.
REQUEST_TIMEOUT = 20

# HTTP headers sent with every request. The User-Agent string identifies the
# client as a desktop Chrome browser on Linux; Accept-Language asks for
# English content.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

# Maps the short module name accepted by --module to that module's path
# below BASE_URL.
#
# Entry order matters: infer_module() returns the FIRST entry whose path
# occurs in a URL, so a specific path (e.g. "accounting") must stay ahead of
# any broader path that contains it (e.g. "invoicing" -> /applications/finance).
MODULE_PATHS = {
    "accounting":    "/applications/finance/accounting",
    "invoicing":     "/applications/finance",
    "inventory":     "/applications/inventory_and_mrp/inventory",
    "purchase":      "/applications/inventory_and_mrp/purchase",
    "manufacturing": "/applications/inventory_and_mrp/manufacturing",
    "sales":         "/applications/sales/sales",
    "crm":           "/applications/sales/crm",
    "employees":     "/applications/hr/employees",
    "payroll":       "/applications/hr/payroll",
    "timesheets":    "/applications/services/timesheets",
    "project":       "/applications/services/project",
    "helpdesk":      "/applications/services/helpdesk",
    "ecommerce":     "/applications/websites/ecommerce",
    "website":       "/applications/websites/website",
    "marketing":     "/applications/marketing",
    "pos":           "/applications/sales/point_of_sale",
    "quality":       "/applications/inventory_and_mrp/quality",
    "maintenance":   "/applications/inventory_and_mrp/maintenance",
    "fleet":         "/applications/hr/fleet",
    "discuss":       "/applications/productivity/discuss",
    "studio":        "/applications/studio",
    "general":       "/applications/general",
    "install":       "/administration",
}

# CSS selectors for page chrome (navigation, sidebars, footers, scripts, ...).
# clean_text() removes every matching element before extracting body text.
NOISE_SELECTORS = [
    "nav", "footer", "header", ".toctree-wrapper",
    ".wy-nav-side", ".wy-menu", ".wy-side-nav-search",
    ".rst-footer-buttons", "#edit-on-github",
    "[role='navigation']", ".breadcrumbs",
    ".sidebar", ".sphinxsidebar",
    ".headerlink",   # Sphinx ¶ permalink anchors
    "script", "style",
]


@dataclass
class DocPage:
    """One scraped documentation page; written as a single JSONL record."""

    url: str  # page URL exactly as it was fetched
    title: str  # <title> text with the trailing "— Odoo…" suffix removed
    module: str  # MODULE_PATHS key inferred from the URL ("general" if none match)
    section: str  # breadcrumb labels joined with " > ", else the first <h1> text
    text: str  # cleaned, markdown-like body text
    headings: list[str]  # text of the h2/h3 headings, in document order
    doc_id: str  # first 16 hex characters of the SHA-256 of the URL


def fetch_sitemap_urls(sitemap_url: str, module_filter: str | None) -> list[str]:
    """Return the English Odoo 18 page URLs listed in the sitemap.

    Args:
        sitemap_url: URL of the sitemap XML document.
        module_filter: Optional key of ``MODULE_PATHS``. When given, only URLs
            containing that module's path are returned.

    Returns:
        The matching ``<loc>`` URLs, in sitemap order.

    Raises:
        ValueError: If ``module_filter`` is not a key of ``MODULE_PATHS``.
        requests.RequestException: If the sitemap cannot be downloaded or the
            server answers with an error status.
    """
    # Validate before touching the network so a typo fails fast instead of
    # being masked by (or paying for) the sitemap download.
    path = None
    if module_filter:
        path = MODULE_PATHS.get(module_filter)
        if path is None:
            raise ValueError(f"Unknown module '{module_filter}'. Choose from: {', '.join(MODULE_PATHS)}")

    log.info(f"Fetching sitemap: {sitemap_url}")
    resp = requests.get(sitemap_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "xml")

    # A URL is a translation when it has a locale path segment. The first
    # alternative keeps the original language list but also accepts regional
    # variants (pt_BR, zh_CN, ...), which a plain "/pt/" or "/zh/" test
    # misses. The second catches any two-letter locale placed directly after
    # the version segment.
    translated = re.compile(
        r"/(?:fr|de|es|pt|nl|zh)(?:_[A-Za-z0-9]+)?/"
        r"|/documentation/[^/]+/[a-z]{2}(?:_[A-Za-z0-9]+)?/"
    )

    urls = []
    for loc in soup.find_all("loc"):
        u = loc.text.strip()
        is_v18 = "/18.0/" in u or "/documentation/18.0" in u
        if is_v18 and not translated.search(u):
            urls.append(u)

    if path is not None:
        urls = [u for u in urls if path in u]
        log.info(f"Module filter '{module_filter}': {len(urls)} pages")
    else:
        log.info(f"Total pages: {len(urls)}")
    return urls


def discover_urls_by_crawl(module_filter: str | None = None) -> list[str]:
    """Discover doc URLs by reading each module's index page.

    For every selected module the index page is fetched (``<path>.html``
    first, then ``<path>/index.html``) and every link that stays under the
    module's path is collected. Only the index page is followed; linked pages
    are not crawled further. This replaces the sitemap when it is unavailable.

    Args:
        module_filter: Optional key of ``MODULE_PATHS`` restricting discovery
            to one module. All modules are used when falsy.

    Returns:
        Sorted, de-duplicated list of discovered URLs. Empty when no index
        page could be fetched.

    Raises:
        ValueError: If ``module_filter`` is not a key of ``MODULE_PATHS``.
    """
    if module_filter:
        path = MODULE_PATHS.get(module_filter)
        if not path:
            raise ValueError(f"Unknown module '{module_filter}'. Choices: {', '.join(MODULE_PATHS)}")
        module_paths = {module_filter: path}
    else:
        module_paths = MODULE_PATHS

    found: set[str] = set()

    for module, path in module_paths.items():
        prefix = BASE_URL + path
        # Module index pages are at <path>.html or <path>/index.html
        for seed in (prefix + ".html", prefix + "/index.html"):
            try:
                resp = requests.get(seed, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as e:
                # Logged at WARNING: the logger runs at INFO, so a DEBUG
                # message would hide every network failure.
                log.warning(f"  seed {seed} failed: {e}")
                continue
            if resp.status_code != 200:
                continue

            found.add(seed)
            soup = BeautifulSoup(resp.text, "html.parser")
            for a in soup.find_all("a", href=True):
                try:
                    abs_url = urljoin(seed, a["href"]).split("#")[0]
                except ValueError:
                    # A single malformed href must not discard the rest of
                    # the page's links.
                    continue
                if abs_url.startswith(prefix):
                    found.add(abs_url)
            log.info(f"  {module}: {sum(1 for u in found if path in u)} URLs from {seed}")
            time.sleep(0.5)
            break  # one successful seed per module is enough
        else:
            log.warning(f"  {module}: no index page could be fetched")

    urls = sorted(found)
    log.info(f"Crawl discovery: {len(urls)} total URLs across {len(module_paths)} modules")
    return urls


def fallback_urls() -> list:
    """Return the built-in emergency URL list.

    Used only when neither the sitemap nor crawl discovery produced URLs.
    """
    pages = (
        # Finance
        "/applications/finance/accounting.html",
        "/applications/finance/accounting/customer_invoices.html",
        "/applications/finance/accounting/vendor_bills.html",
        "/applications/finance/accounting/get_started/chart_of_accounts.html",
        "/applications/finance/accounting/taxes.html",
        "/applications/finance/accounting/reporting.html",
        "/applications/finance/accounting/bank.html",
        "/applications/finance/expenses.html",
        "/applications/finance/payment_providers.html",
        # Sales
        "/applications/sales/sales.html",
        "/applications/sales/crm.html",
        "/applications/sales/crm/pipeline.html",
        # Inventory & MRP
        "/applications/inventory_and_mrp/inventory.html",
        "/applications/inventory_and_mrp/purchase.html",
        "/applications/inventory_and_mrp/manufacturing.html",
        # HR
        "/applications/hr/employees.html",
        "/applications/hr/payroll.html",
        "/applications/hr/payroll/payslips.html",
        # Websites and services
        "/applications/websites/ecommerce.html",
        "/applications/services/helpdesk.html",
        "/applications/services/project.html",
    )
    # Plain concatenation on purpose: urljoin() given an absolute path would
    # throw away the /documentation/18.0 prefix carried by BASE_URL.
    return [f"{BASE_URL}{page}" for page in pages]


def infer_module(url: str) -> str:
    """Return the name of the first MODULE_PATHS entry whose path occurs in *url*.

    Entries are tried in dictionary order; "general" is returned when none
    match.
    """
    candidates = (
        name
        for name, route in MODULE_PATHS.items()
        if route.lstrip("/") in url
    )
    return next(candidates, "general")


def extract_section(soup: BeautifulSoup) -> str:
    """Describe where a page sits in the docs.

    Returns the non-empty breadcrumb link labels joined with " > " when any
    breadcrumb links exist; otherwise the first <h1> text, or the fixed
    string "Odoo 18 Docs" when the page has no <h1>.
    """
    crumbs = soup.select(".breadcrumbs a, .wy-breadcrumbs a, nav[aria-label='breadcrumb'] a")
    if not crumbs:
        top_heading = soup.find("h1")
        if top_heading:
            return top_heading.get_text(strip=True)
        return "Odoo 18 Docs"

    labels = (link.get_text(strip=True) for link in crumbs)
    return " > ".join(label for label in labels if label)


def clean_text(soup: BeautifulSoup) -> tuple[str, list[str]]:
    """Extract markdown-like body text and section headings from a page.

    NOTE: every element matching ``NOISE_SELECTORS`` is removed from *soup*
    IN PLACE. Read anything that lives in those regions (e.g. breadcrumbs)
    before calling this function.

    The main content container is walked in document order:

    * h1-h4 become ``#``-prefixed lines; h2/h3 text is also collected.
    * ``<p>`` longer than 20 characters becomes a plain line.
    * ``<li>`` longer than 5 characters becomes a ``- `` bullet.
    * ``<code>`` becomes a back-ticked line.

    A paragraph, list item or code element nested inside an element whose
    text was already emitted is skipped, because the outer element's text
    already contains it. This avoids, for example, emitting both the bullet
    and the paragraph of a ``<li><p>…</p></li>`` item.

    Args:
        soup: Parsed page; mutated as described above.

    Returns:
        ``(text, headings)``: the cleaned text and the list of h2/h3 heading
        texts. ``("", [])`` when no content container is found.
    """
    for sel in NOISE_SELECTORS:
        for el in soup.select(sel):
            el.decompose()

    content = (
        soup.find("div", {"class": "document"})
        or soup.find("article")
        or soup.find("main")
        or soup.find("div", {"role": "main"})
        or soup.find("body")
    )
    if not content:
        return "", []

    heading_tags = ("h1", "h2", "h3", "h4")
    headings: list[str] = []
    lines: list[str] = []
    # id() of every element whose full text is already part of `lines`.
    # `descendants` is a pre-order walk, so ancestors are seen before the
    # elements nested in them.
    emitted: set[int] = set()

    for el in content.descendants:
        name = getattr(el, "name", None)

        if name in heading_tags:
            text = el.get_text(strip=True)
            if text:
                prefix = "#" * int(name[1])
                lines.append(f"\n{prefix} {text}\n")
                emitted.add(id(el))
                if name in ("h2", "h3"):
                    headings.append(text)
            continue

        if name not in ("p", "li", "code"):
            continue
        if any(id(parent) in emitted for parent in el.parents):
            continue  # text already covered by an enclosing emitted element

        if name == "p":
            text = el.get_text(separator=" ", strip=True)
            if len(text) > 20:
                lines.append(text)
                emitted.add(id(el))
        elif name == "li":
            text = el.get_text(separator=" ", strip=True)
            if len(text) > 5:
                lines.append(f"- {text}")
                emitted.add(id(el))
        else:  # code
            text = el.get_text(strip=True)
            if text:
                lines.append(f"`{text}`")
                emitted.add(id(el))

    raw = "\n".join(lines)
    clean = re.sub(r"\n{3,}", "\n\n", raw).strip()
    # Strip residual Sphinx pilcrow characters (¶ and its mis-decoded form Â¶)
    clean = re.sub(r"Â¶|¶", "", clean).strip()
    return clean, headings


def fetch_page(url: str, debug: bool = False) -> DocPage | None:
    """Download one documentation page and convert it into a ``DocPage``.

    The request is attempted up to ``MAX_RETRIES`` times with exponential
    back-off (1s, 2s, ...) on ``requests.RequestException``. A 404 is not
    retried.

    Args:
        url: Page URL to fetch.
        debug: When true, log the title, text length and a text snippet, and
            suppress the "Too short" warning.

    Returns:
        The parsed page, or ``None`` when the page is a 404, every attempt
        failed, or fewer than 100 characters of text were extracted.
    """
    for attempt in range(MAX_RETRIES):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            if resp.status_code == 404:
                log.warning(f"404: {url}")
                return None
            resp.raise_for_status()
        except requests.RequestException as e:
            if attempt < MAX_RETRIES - 1:
                wait = 2 ** attempt
                log.warning(f"Retry {attempt+1} for {url}: {e} (wait {wait}s)")
                time.sleep(wait)
                continue
            log.error(f"Failed: {url}: {e}")
            return None

        # Hand BeautifulSoup the raw bytes so it detects the encoding itself
        # (including <meta charset>). resp.text falls back to ISO-8859-1 when
        # the Content-Type header carries no charset, which garbles UTF-8
        # pages ("¶" becoming "Â¶", "—" no longer matching the title regex).
        soup = BeautifulSoup(resp.content, "html.parser")
        title_tag = soup.find("title")
        title = title_tag.get_text(strip=True) if title_tag else url
        title = re.sub(r"\s*—\s*Odoo.*", "", title).strip()

        # Must run BEFORE clean_text(): clean_text() strips nav/breadcrumb
        # elements from the soup in place, after which no breadcrumb is left
        # to read.
        section = extract_section(soup)

        text, headings = clean_text(soup)
        if debug:
            log.info(f"  DEBUG title={title!r}  text_len={len(text)}  snippet={text[:120]!r}")
        if len(text) < 100:
            if not debug:
                log.warning(f"  Too short ({len(text)} chars): {url}")
            return None

        return DocPage(
            url=url,
            title=title,
            module=infer_module(url),
            section=section,
            text=text,
            headings=headings,
            doc_id=hashlib.sha256(url.encode()).hexdigest()[:16],
        )

    # Only reachable when MAX_RETRIES is 0; stated explicitly so the return
    # contract does not depend on falling off the end of the function.
    return None


def crawl(module: str | None = None, limit: int | None = None, debug: bool = False):
    """Collect page URLs, scrape each page and write the results as JSONL.

    URL collection tries three strategies in order and uses the first one
    that yields at least one URL: the sitemap, link discovery from the module
    index pages, and finally the hardcoded fallback list.

    ``OUTPUT_FILE`` is overwritten with one JSON object per scraped page.

    Args:
        module: Optional key of ``MODULE_PATHS`` restricting the crawl.
        limit: Optional maximum number of pages; a falsy value means no limit.
        debug: Forwarded to ``fetch_page``.

    Raises:
        ValueError: If ``module`` is given but is not a key of
            ``MODULE_PATHS``.
    """
    # Reject an unknown module up front. Otherwise the ValueError raised by
    # the discovery helpers is swallowed by the broad handlers below and the
    # run silently crawls the entire fallback list instead.
    if module and module not in MODULE_PATHS:
        raise ValueError(f"Unknown module '{module}'. Choose from: {', '.join(MODULE_PATHS)}")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    try:
        urls = fetch_sitemap_urls(SITEMAP_URL, module)
        if not urls:
            raise ValueError("Sitemap returned 0 matching URLs")
    except Exception as e:
        log.warning(f"Sitemap unavailable ({e}), switching to crawl discovery")
        try:
            urls = discover_urls_by_crawl(module)
            # Discovery reports per-seed failures itself and returns an empty
            # list, so an empty result has to be treated as a failure here or
            # the hardcoded fallback would never be used.
            if not urls:
                raise ValueError("Crawl discovery returned 0 URLs")
        except Exception as e2:
            log.warning(f"Crawl discovery failed ({e2}), using hardcoded fallback list")
            urls = fallback_urls()
            if module:
                path = MODULE_PATHS.get(module, "")
                urls = [u for u in urls if path in u]

    if limit:
        urls = urls[:limit]

    total = len(urls)
    log.info(f"Crawling {total} pages...")
    written = skipped = 0

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for i, url in enumerate(urls, 1):
            log.info(f"[{i}/{total}] {url}")
            page = fetch_page(url, debug=debug)
            if page:
                f.write(json.dumps(asdict(page), ensure_ascii=False) + "\n")
                written += 1
            else:
                skipped += 1
            # Pause between requests only; nothing follows the last page.
            if i < total:
                time.sleep(DELAY_SECONDS)

    log.info(f"\n✅ Done. Written: {written}, Skipped: {skipped}")
    log.info(f"   Output: {OUTPUT_FILE}")


if __name__ == "__main__":
    # Command-line entry point: parse the three optional flags and run the crawl.
    cli = argparse.ArgumentParser(description="Odoo 18 docs scraper")
    module_help = f"Filter to one module: {', '.join(MODULE_PATHS)}"
    cli.add_argument("--module", help=module_help)
    cli.add_argument("--limit", type=int, help="Max pages (for testing)")
    cli.add_argument(
        "--debug",
        action="store_true",
        help="Print extracted text snippets per page",
    )
    opts = cli.parse_args()
    crawl(module=opts.module, limit=opts.limit, debug=opts.debug)