fix: replace dead sitemap with crawl-based URL discovery

The Odoo 18 sitemap.xml returns 404. The fallback URL list also failed
because urljoin(BASE_URL, /applications/...) strips the /documentation/18.0
path (absolute path arg replaces the whole path component in urljoin).

Changes:
- Add discover_urls_by_crawl(): fetches each module index page and
  collects all internal links — replaces sitemap as primary source
- crawl() now chains: sitemap → crawl discovery → hardcoded fallback
- Fix fallback_urls() to use BASE_URL + path (not urljoin) and trim
  the list to known-good pages
- Keep crawl discovery rate-limited (0.5s between module seeds)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Carlos Garcia
2026-05-14 13:05:40 -04:00
parent 3d94c4eb25
commit 608bb51943

View File

@@ -115,68 +115,79 @@ def fetch_sitemap_urls(sitemap_url: str, module_filter: str | None) -> list:
return urls
def discover_urls_by_crawl(module_filter: str | None = None) -> list:
    """
    Discover doc URLs by fetching each module's index page and collecting
    every internal link that stays within that module's path.

    This replaces the sitemap when it is unavailable.

    Args:
        module_filter: Key of MODULE_PATHS restricting discovery to a single
            module. None (or empty) crawls every module in MODULE_PATHS.

    Returns:
        Sorted list of unique absolute URLs with fragments stripped. Each
        seed page that answered with HTTP 200 is included in the result.

    Raises:
        ValueError: If module_filter is not a key of MODULE_PATHS.
        RuntimeError: If no URL could be discovered for any module. Raising
            (rather than returning an empty list) lets callers fall through
            to their next URL source instead of proceeding with nothing.
    """
    if module_filter:
        path = MODULE_PATHS.get(module_filter)
        if not path:
            raise ValueError(
                f"Unknown module '{module_filter}'. Choices: {', '.join(MODULE_PATHS)}"
            )
        module_paths = {module_filter: path}
    else:
        module_paths = MODULE_PATHS

    found: set = set()
    for module, path in module_paths.items():
        prefix = BASE_URL + path
        # Module index pages are at <path>.html or <path>/index.html
        for seed in (prefix + ".html", prefix + "/index.html"):
            try:
                resp = requests.get(seed, headers=HEADERS, timeout=REQUEST_TIMEOUT)
                if resp.status_code != 200:
                    continue
                soup = BeautifulSoup(resp.text, "html.parser")
            except Exception as e:
                # Best effort: one failing seed must not abort the remaining
                # seeds or modules.
                log.debug(f" seed {seed} failed: {e}")
                continue

            module_urls = {seed}
            for a in soup.find_all("a", href=True):
                try:
                    abs_url = urljoin(seed, a["href"]).split("#")[0]
                except ValueError:
                    # A single malformed href should not discard the page.
                    continue
                if abs_url.startswith(prefix):
                    module_urls.add(abs_url)

            found |= module_urls
            log.info(f" {module}: {len(module_urls)} URLs from {seed}")
            time.sleep(0.5)  # rate-limit between module seeds
            break  # one successful seed per module is enough

    if not found:
        raise RuntimeError(
            f"Crawl discovery found no URLs across {len(module_paths)} modules"
        )

    urls = sorted(found)
    log.info(f"Crawl discovery: {len(urls)} total URLs across {len(module_paths)} modules")
    return urls
def fallback_urls() -> list:
    """Last-resort hardcoded list when both sitemap and crawl discovery fail.

    Returns:
        Absolute URLs built by appending each hardcoded documentation path
        to BASE_URL, in the order listed below.
    """
    paths = [
        "/applications/finance/accounting.html",
        "/applications/finance/accounting/customer_invoices.html",
        "/applications/finance/accounting/customer_invoices/overview.html",
        "/applications/finance/accounting/vendor_bills.html",
        "/applications/finance/accounting/get_started/chart_of_accounts.html",
        "/applications/finance/accounting/get_started/cheat_sheet.html",
        "/applications/finance/accounting/get_started/multi_currency.html",
        "/applications/finance/accounting/reporting/budget.html",
        "/applications/finance/accounting/reporting/analytic_accounting.html",
        "/applications/finance/accounting/bank.html",
        "/applications/finance/accounting/taxes.html",
        "/applications/finance/accounting/reporting.html",
        "/applications/finance/expenses.html",
        "/applications/finance/expenses/reinvoice_expenses.html",
        "/applications/finance/payment_providers.html",
        "/applications/finance.html",
        "/applications/sales.html",
        "/applications/sales/sales.html",
        "/applications/sales/crm.html",
        "/applications/sales/crm/pipeline.html",
        "/applications/sales/crm/acquire_leads/email_manual.html",
        "/applications/sales/crm/pipeline/manage_sales_teams.html",
        "/applications/sales/crm/optimize/utilize_activities.html",
        "/applications/inventory_and_mrp/inventory.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/mto.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/reordering_rules.html",
        "/applications/inventory_and_mrp/inventory/shipping_receiving/daily_operations.html",
        "/applications/inventory_and_mrp/purchase.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/rfq.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/manage.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/blanket_orders.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/calls_for_tenders.html",
        "/applications/inventory_and_mrp/manufacturing.html",
        "/applications/inventory_and_mrp/manufacturing/workflows.html",
        "/applications/inventory_and_mrp/manufacturing/workflows/use_mps.html",
        "/applications/inventory_and_mrp/manufacturing/workflows/manufacturing_backorders.html",
        "/applications/inventory_and_mrp/manufacturing/subcontracting.html",
        "/applications/inventory_and_mrp/manufacturing/advanced_configuration/kit_shipping.html",
        "/applications/hr.html",
        "/applications/hr/employees.html",
        "/applications/hr/employees/new_employee.html",
        "/applications/hr/payroll.html",
        "/applications/hr/payroll/contracts.html",
        "/applications/hr/payroll/payslips.html",
        "/applications/hr/payroll/batches.html",
        "/applications/websites/ecommerce.html",
        "/applications/websites/ecommerce/products.html",
        "/applications/websites/ecommerce/checkout_payment_shipping/checkout.html",
        "/applications/websites/ecommerce/checkout_payment_shipping/payments.html",
        "/applications/websites/ecommerce/customer_accounts.html",
        "/applications/services/helpdesk.html",
        "/applications/services/helpdesk/advanced/after_sales.html",
        "/applications/services/project.html",
        "/applications/finance/fiscal_localizations/united_states.html",
        "/applications.html",
        "/applications/general.html",
    ]
    # NOTE: urljoin(BASE_URL, p) would replace BASE_URL's own path (the
    # /documentation/18.0 part) because every p is an absolute path, so the
    # URLs are built by plain string concatenation instead.
    return [BASE_URL + p for p in paths]
def infer_module(url: str) -> str:
@@ -286,12 +297,18 @@ def crawl(module: str | None = None, limit: int | None = None, debug: bool = Fal
try:
urls = fetch_sitemap_urls(SITEMAP_URL, module)
if not urls:
raise ValueError("Sitemap returned 0 matching URLs")
except Exception as e:
log.warning(f"Sitemap unavailable ({e}), using fallback list")
urls = fallback_urls()
if module:
path = MODULE_PATHS.get(module, "")
urls = [u for u in urls if path.lstrip("/") in u]
log.warning(f"Sitemap unavailable ({e}), switching to crawl discovery")
try:
urls = discover_urls_by_crawl(module)
except Exception as e2:
log.warning(f"Crawl discovery failed ({e2}), using hardcoded fallback list")
urls = fallback_urls()
if module:
path = MODULE_PATHS.get(module, "")
urls = [u for u in urls if path in u]
if limit:
urls = urls[:limit]