Initial commit: Odoo 18 RAG stack

Scraper, indexer, and FastAPI query service for Retrieval-Augmented
Generation over Odoo 18 documentation. Uses Qdrant + Ollama (nomic-embed-text
+ llama3.1). Integrates with ActiveBlue PeerBus agent interface.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-14 11:25:55 -04:00
commit 7fb1573bac
10 changed files with 1295 additions and 0 deletions

316
scraper/scraper.py Normal file
View File

@@ -0,0 +1,316 @@
#!/usr/bin/env python3
"""
Odoo 18 Documentation Scraper
==============================
Crawls the Odoo 18 docs sitemap, extracts clean text from each page,
and saves structured JSON ready for the indexer.
Usage:
python scraper.py # full crawl
python scraper.py --module accounting # single module
python scraper.py --limit 50 # test run
Output: ../data/raw/odoo18_docs_raw.jsonl
"""
import json
import time
import re
import argparse
import hashlib
import logging
from pathlib import Path
from urllib.parse import urljoin
from dataclasses import dataclass, asdict
import requests
from bs4 import BeautifulSoup
# Module-wide logging: "<LEVEL> <message>", INFO and above.
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
log = logging.getLogger(__name__)

# Root of the Odoo 18 documentation and the sitemap listing its pages.
BASE_URL = "https://www.odoo.com/documentation/18.0"
SITEMAP_URL = BASE_URL + "/sitemap.xml"

# Crawl output; the relative path resolves against the current working directory.
OUTPUT_DIR = Path("..") / "data" / "raw"
OUTPUT_FILE = OUTPUT_DIR / "odoo18_docs_raw.jsonl"

# Politeness / resilience knobs.
DELAY_SECONDS = 1.2    # pause between consecutive page fetches in crawl()
MAX_RETRIES = 3        # attempts per page in fetch_page()
REQUEST_TIMEOUT = 20   # passed as `timeout=` to every requests.get() call

# Sent with every HTTP request so the crawler identifies itself.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; ActiveBlue-RAG-Indexer/1.0; +https://activeblue.net)",
}

# Module name (as accepted by --module) -> URL path fragment.
# Insertion order matters: infer_module() returns the FIRST entry whose
# fragment occurs in the URL, so specific paths must precede broader ones
# (e.g. "accounting" before "invoicing", whose fragment covers all of finance).
MODULE_PATHS = {
    "accounting": "/applications/finance/accounting",
    "invoicing": "/applications/finance",
    "inventory": "/applications/inventory_and_mrp/inventory",
    "purchase": "/applications/inventory_and_mrp/purchase",
    "manufacturing": "/applications/inventory_and_mrp/manufacturing",
    "sales": "/applications/sales/sales",
    "crm": "/applications/sales/crm",
    "employees": "/applications/hr/employees",
    "payroll": "/applications/hr/payroll",
    "timesheets": "/applications/services/timesheets",
    "project": "/applications/services/project",
    "helpdesk": "/applications/services/helpdesk",
    "ecommerce": "/applications/websites/ecommerce",
    "website": "/applications/websites/website",
    "marketing": "/applications/marketing",
    "pos": "/applications/sales/point_of_sale",
    "quality": "/applications/inventory_and_mrp/quality",
    "maintenance": "/applications/inventory_and_mrp/maintenance",
    "fleet": "/applications/hr/fleet",
    "discuss": "/applications/productivity/discuss",
    "studio": "/applications/studio",
    "general": "/applications/general",
    "install": "/administration",
}

# CSS selectors for page chrome that clean_text() removes before extracting text.
NOISE_SELECTORS = [
    "nav",
    "footer",
    "header",
    ".toctree-wrapper",
    ".wy-nav-side",
    ".wy-menu",
    ".wy-side-nav-search",
    ".rst-footer-buttons",
    "#edit-on-github",
    "[role='navigation']",
    ".breadcrumbs",
    ".sidebar",
    ".sphinxsidebar",
    "script",
    "style",
]
@dataclass
class DocPage:
    """One scraped documentation page, serialised as a single JSONL record."""

    # Address the page was fetched from.
    url: str
    # Contents of the <title> tag with any " — Odoo…" suffix removed.
    title: str
    # Module name chosen by infer_module() from the URL.
    module: str
    # Breadcrumb trail joined with " > ", or the page's first <h1>.
    section: str
    # Markdown-like plain text produced by clean_text().
    text: str
    # Text of the h2/h3 headings found in the page body.
    headings: list
    # First 16 hex characters of the SHA-256 digest of the URL.
    doc_id: str
def fetch_sitemap_urls(sitemap_url: str, module_filter: str | None) -> list:
    """Download the sitemap and return the page URLs worth crawling.

    Keeps only <loc> entries that belong to the 18.0 documentation and do not
    contain one of the filtered language segments. When *module_filter* is
    given, the result is further narrowed to URLs containing that module's
    path fragment from MODULE_PATHS.

    Raises:
        requests.RequestException: the sitemap could not be fetched, or the
            server answered with an error status.
        ValueError: *module_filter* is not a key of MODULE_PATHS.
    """
    log.info(f"Fetching sitemap: {sitemap_url}")
    response = requests.get(sitemap_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    sitemap = BeautifulSoup(response.text, "xml")

    # NOTE(review): only these six two-letter segments are filtered; a
    # locale-style segment such as "/pt_BR/" would not match — confirm
    # against the live sitemap.
    language_markers = [f"/{lang}/" for lang in ["fr", "de", "es", "pt", "nl", "zh"]]

    urls = []
    for loc in sitemap.find_all("loc"):
        candidate = loc.text.strip()
        if "/18.0/" not in candidate and "/documentation/18.0" not in candidate:
            continue
        if any(marker in candidate for marker in language_markers):
            continue
        urls.append(candidate)

    if not module_filter:
        log.info(f"Total pages: {len(urls)}")
        return urls

    path = MODULE_PATHS.get(module_filter)
    if not path:
        raise ValueError(f"Unknown module '{module_filter}'. Choose from: {', '.join(MODULE_PATHS)}")
    selected = [u for u in urls if path in u]
    log.info(f"Module filter '{module_filter}': {len(selected)} pages")
    return selected
def fallback_urls() -> list:
"""Curated fallback list if sitemap is unavailable."""
paths = [
"/applications/finance/accounting.html",
"/applications/finance/accounting/customer_invoices.html",
"/applications/finance/accounting/customer_invoices/overview.html",
"/applications/finance/accounting/vendor_bills.html",
"/applications/finance/accounting/get_started/chart_of_accounts.html",
"/applications/finance/accounting/get_started/cheat_sheet.html",
"/applications/finance/accounting/get_started/multi_currency.html",
"/applications/finance/accounting/reporting/budget.html",
"/applications/finance/accounting/reporting/analytic_accounting.html",
"/applications/finance/accounting/bank.html",
"/applications/finance/accounting/taxes.html",
"/applications/finance/accounting/reporting.html",
"/applications/finance/expenses.html",
"/applications/finance/expenses/reinvoice_expenses.html",
"/applications/finance/payment_providers.html",
"/applications/finance.html",
"/applications/sales.html",
"/applications/sales/sales.html",
"/applications/sales/crm.html",
"/applications/sales/crm/pipeline.html",
"/applications/sales/crm/acquire_leads/email_manual.html",
"/applications/sales/crm/pipeline/manage_sales_teams.html",
"/applications/sales/crm/optimize/utilize_activities.html",
"/applications/inventory_and_mrp/inventory.html",
"/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment.html",
"/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/mto.html",
"/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/reordering_rules.html",
"/applications/inventory_and_mrp/inventory/shipping_receiving/daily_operations.html",
"/applications/inventory_and_mrp/purchase.html",
"/applications/inventory_and_mrp/purchase/manage_deals/rfq.html",
"/applications/inventory_and_mrp/purchase/manage_deals/manage.html",
"/applications/inventory_and_mrp/purchase/manage_deals/blanket_orders.html",
"/applications/inventory_and_mrp/purchase/manage_deals/calls_for_tenders.html",
"/applications/inventory_and_mrp/manufacturing.html",
"/applications/inventory_and_mrp/manufacturing/workflows.html",
"/applications/inventory_and_mrp/manufacturing/workflows/use_mps.html",
"/applications/inventory_and_mrp/manufacturing/workflows/manufacturing_backorders.html",
"/applications/inventory_and_mrp/manufacturing/subcontracting.html",
"/applications/inventory_and_mrp/manufacturing/advanced_configuration/kit_shipping.html",
"/applications/hr.html",
"/applications/hr/employees.html",
"/applications/hr/employees/new_employee.html",
"/applications/hr/payroll.html",
"/applications/hr/payroll/contracts.html",
"/applications/hr/payroll/payslips.html",
"/applications/hr/payroll/batches.html",
"/applications/websites/ecommerce.html",
"/applications/websites/ecommerce/products.html",
"/applications/websites/ecommerce/checkout_payment_shipping/checkout.html",
"/applications/websites/ecommerce/checkout_payment_shipping/payments.html",
"/applications/websites/ecommerce/customer_accounts.html",
"/applications/services/helpdesk.html",
"/applications/services/helpdesk/advanced/after_sales.html",
"/applications/services/project.html",
"/applications/finance/fiscal_localizations/united_states.html",
"/applications.html",
"/applications/general.html",
]
return [urljoin(BASE_URL, p) for p in paths]
def infer_module(url: str) -> str:
    """Return the first MODULE_PATHS key whose path fragment occurs in *url*.

    Entries are tried in dictionary order, with the leading "/" of each
    fragment ignored. Returns "general" when nothing matches.
    """
    candidates = (
        name
        for name, fragment in MODULE_PATHS.items()
        if fragment.lstrip("/") in url
    )
    return next(candidates, "general")
def extract_section(soup: BeautifulSoup) -> str:
    """Describe where a page sits in the docs hierarchy.

    Returns the non-empty breadcrumb link texts joined with " > " when any
    breadcrumb links are present; otherwise the text of the first <h1>;
    otherwise the literal "Odoo 18 Docs".
    """
    crumbs = soup.select(".breadcrumbs a, .wy-breadcrumbs a, nav[aria-label='breadcrumb'] a")
    if not crumbs:
        first_heading = soup.find("h1")
        if first_heading is None:
            return "Odoo 18 Docs"
        return first_heading.get_text(strip=True)

    labels = []
    for link in crumbs:
        label = link.get_text(strip=True)
        if label:
            labels.append(label)
    return " > ".join(labels)
_HEADING_TAGS = ("h1", "h2", "h3", "h4")


def _covered_by_emitted_ancestor(el, root, emitted_ids) -> bool:
    """Return True if an ancestor of *el* (below *root*) already produced a line."""
    for ancestor in el.parents:
        if id(ancestor) in emitted_ids:
            return True
        if ancestor is root:
            break
    return False


def clean_text(soup: BeautifulSoup) -> tuple:
    """Strip page chrome and render the main content as markdown-like text.

    NOTE: this mutates *soup* in place — every element matching
    NOISE_SELECTORS is removed from the tree before extraction.

    Args:
        soup: Parsed HTML page.

    Returns:
        A ``(text, headings)`` tuple. ``text`` holds headings as ``#``-prefixed
        lines, paragraphs as plain lines, list items as ``- `` bullets and
        stand-alone code as backtick-wrapped lines. ``headings`` lists the
        text of every h2/h3. Returns ``("", [])`` if no content container is
        found.
    """
    for selector in NOISE_SELECTORS:
        for el in soup.select(selector):
            el.decompose()

    # First matching container wins, from most to least specific.
    content = (
        soup.find("div", {"class": "document"})
        or soup.find("article")
        or soup.find("main")
        or soup.find("div", {"role": "main"})
        or soup.find("body")
    )
    if not content:
        return "", []

    headings = []
    lines = []
    # id() of every element whose full text is already in `lines`.
    # `descendants` walks in document order, so a parent is always recorded
    # before its children are visited. Without this, nested markup such as
    # <li><p>…</p></li>, <p>… <code>x</code> …</p> or nested lists would be
    # emitted twice (once via the parent's get_text(), once on its own).
    emitted = set()

    for el in content.descendants:
        tag = getattr(el, "name", None)

        if tag in _HEADING_TAGS:
            # Headings are never suppressed: they feed the `headings` metadata.
            text = el.get_text(strip=True)
            if text:
                prefix = "#" * int(tag[1])
                lines.append(f"\n{prefix} {text}\n")
                if tag in ("h2", "h3"):
                    headings.append(text)
                emitted.add(id(el))
            continue

        if tag not in ("p", "li", "code"):
            continue
        if _covered_by_emitted_ancestor(el, content, emitted):
            continue

        if tag == "p":
            text = el.get_text(separator=" ", strip=True)
            # Very short paragraphs are dropped as noise.
            if text and len(text) > 20:
                lines.append(text)
                emitted.add(id(el))
        elif tag == "li":
            text = el.get_text(separator=" ", strip=True)
            if text and len(text) > 5:
                lines.append(f"- {text}")
                emitted.add(id(el))
        else:  # "code" that no emitted p/li/heading already contains
            text = el.get_text(strip=True)
            if text:
                lines.append(f"`{text}`")
                emitted.add(id(el))

    raw = "\n".join(lines)
    # Collapse runs of blank lines left behind by the heading padding.
    clean = re.sub(r"\n{3,}", "\n\n", raw).strip()
    return clean, headings
def fetch_page(url: str) -> DocPage | None:
    """Fetch one documentation page and convert it to a DocPage.

    Makes up to MAX_RETRIES attempts, sleeping 1s, 2s, 4s, … between failed
    attempts.

    Args:
        url: Absolute URL of the page.

    Returns:
        The parsed page, or None when the server answers 404, the extracted
        text is shorter than 100 characters, or every attempt fails with a
        requests.RequestException.
    """
    for attempt in range(MAX_RETRIES):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            if resp.status_code == 404:
                # A missing page will not appear on retry.
                log.warning(f"404: {url}")
                return None
            resp.raise_for_status()

            soup = BeautifulSoup(resp.text, "html.parser")
            title_tag = soup.find("title")
            title = title_tag.get_text(strip=True) if title_tag else url
            title = re.sub(r"\s*—\s*Odoo.*", "", title).strip()

            # Read the breadcrumbs BEFORE clean_text(): it removes the
            # NOISE_SELECTORS elements (nav, .breadcrumbs, header, …) from
            # the soup in place, after which no breadcrumb link can be found.
            section = extract_section(soup)
            text, headings = clean_text(soup)
            if len(text) < 100:
                # Too little content to be worth indexing.
                return None

            return DocPage(
                url=url,
                title=title,
                module=infer_module(url),
                section=section,
                text=text,
                headings=headings,
                doc_id=hashlib.sha256(url.encode()).hexdigest()[:16],
            )
        except requests.RequestException as e:
            if attempt < MAX_RETRIES - 1:
                wait = 2 ** attempt
                log.warning(f"Retry {attempt+1} for {url}: {e} (wait {wait}s)")
                time.sleep(wait)
            else:
                log.error(f"Failed: {url}: {e}")
                return None
    return None
def crawl(module: str | None = None, limit: int | None = None):
    """Crawl the documentation and write one JSON object per page to OUTPUT_FILE.

    URLs come from the sitemap; if that fails for any reason the curated
    fallback list is used instead. OUTPUT_FILE is overwritten on every run.

    Args:
        module: Optional key of MODULE_PATHS restricting the crawl to one module.
        limit: Optional cap on the number of pages; a falsy value means no cap.

    Raises:
        ValueError: *module* is given but is not a key of MODULE_PATHS.
    """
    # Validate before any network or file activity. Previously the ValueError
    # raised for an unknown module was swallowed by the broad `except` below,
    # and the fallback filter then used "" — which is a substring of every
    # URL — so the whole fallback list was crawled.
    if module and module not in MODULE_PATHS:
        raise ValueError(f"Unknown module '{module}'. Choose from: {', '.join(MODULE_PATHS)}")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    try:
        urls = fetch_sitemap_urls(SITEMAP_URL, module)
    except Exception as e:
        # Deliberate best-effort: any sitemap problem degrades to the curated list.
        log.warning(f"Sitemap unavailable ({e}), using fallback list")
        urls = fallback_urls()
        if module:
            path = MODULE_PATHS[module]
            urls = [u for u in urls if path.lstrip("/") in u]

    if limit:
        urls = urls[:limit]

    total = len(urls)
    log.info(f"Crawling {total} pages...")
    written = skipped = 0

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for i, url in enumerate(urls, 1):
            log.info(f"[{i}/{total}] {url}")
            page = fetch_page(url)
            if page:
                f.write(json.dumps(asdict(page), ensure_ascii=False) + "\n")
                written += 1
            else:
                skipped += 1
            # Rate-limit between requests; nothing to wait for after the last one.
            if i < total:
                time.sleep(DELAY_SECONDS)

    log.info(f"\n✅ Done. Written: {written}, Skipped: {skipped}")
    log.info(f" Output: {OUTPUT_FILE}")
if __name__ == "__main__":
    # Command-line entry point: parse the two optional flags and start the crawl.
    cli = argparse.ArgumentParser(description="Odoo 18 docs scraper")
    module_help = f"Filter to one module: {', '.join(MODULE_PATHS)}"
    cli.add_argument("--module", help=module_help)
    cli.add_argument("--limit", type=int, help="Max pages (for testing)")
    options = cli.parse_args()
    crawl(module=options.module, limit=options.limit)