fix: replace dead sitemap with crawl-based URL discovery
The Odoo 18 sitemap.xml returns 404. The fallback URL list also failed because urljoin(BASE_URL, /applications/...) strips the /documentation/18.0 path (absolute path arg replaces the whole path component in urljoin). Changes: - Add discover_urls_by_crawl(): fetches each module index page and collects all internal links — replaces sitemap as primary source - crawl() now chains: sitemap → crawl discovery → hardcoded fallback - Fix fallback_urls() to use BASE_URL + path (not urljoin) and trim the list to known-good pages - Keep crawl discovery rate-limited (0.5s between module seeds) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -115,68 +115,79 @@ def fetch_sitemap_urls(sitemap_url: str, module_filter: str | None) -> list:
|
||||
return urls
|
||||
|
||||
|
||||
def discover_urls_by_crawl(module_filter: str | None = None) -> list:
|
||||
"""
|
||||
Discover doc URLs by fetching each module's index page and collecting
|
||||
every internal link that stays within that module's path.
|
||||
This replaces the sitemap when it is unavailable.
|
||||
"""
|
||||
if module_filter:
|
||||
path = MODULE_PATHS.get(module_filter)
|
||||
if not path:
|
||||
raise ValueError(f"Unknown module '{module_filter}'. Choices: {', '.join(MODULE_PATHS)}")
|
||||
module_paths = {module_filter: path}
|
||||
else:
|
||||
module_paths = MODULE_PATHS
|
||||
|
||||
found: set = set()
|
||||
|
||||
for module, path in module_paths.items():
|
||||
# Module index pages are at <path>.html or <path>/index.html
|
||||
seeds = [
|
||||
BASE_URL + path + ".html",
|
||||
BASE_URL + path + "/index.html",
|
||||
]
|
||||
for seed in seeds:
|
||||
try:
|
||||
resp = requests.get(seed, headers=HEADERS, timeout=REQUEST_TIMEOUT)
|
||||
if resp.status_code != 200:
|
||||
continue
|
||||
found.add(seed)
|
||||
soup = BeautifulSoup(resp.text, "html.parser")
|
||||
prefix = BASE_URL + path
|
||||
for a in soup.find_all("a", href=True):
|
||||
abs_url = urljoin(seed, a["href"]).split("#")[0]
|
||||
if abs_url.startswith(prefix) and abs_url not in found:
|
||||
found.add(abs_url)
|
||||
log.info(f" {module}: {len([u for u in found if path in u])} URLs from {seed}")
|
||||
time.sleep(0.5)
|
||||
break # one successful seed per module is enough
|
||||
except Exception as e:
|
||||
log.debug(f" seed {seed} failed: {e}")
|
||||
|
||||
urls = sorted(found)
|
||||
log.info(f"Crawl discovery: {len(urls)} total URLs across {len(module_paths)} modules")
|
||||
return urls
|
||||
|
||||
|
||||
def fallback_urls() -> list[str]:
    """Last-resort hardcoded list when both sitemap and crawl discovery fail.

    Returns:
        Absolute URLs built by appending each curated path to BASE_URL, in
        the order listed below and without duplicates.
    """
    # NOTE(review): these paths have not been re-verified against the live
    # documentation site — confirm before relying on any single entry.
    paths = [
        "/applications/finance/accounting.html",
        "/applications/finance/accounting/customer_invoices.html",
        "/applications/finance/accounting/customer_invoices/overview.html",
        "/applications/finance/accounting/vendor_bills.html",
        "/applications/finance/accounting/get_started/chart_of_accounts.html",
        "/applications/finance/accounting/get_started/cheat_sheet.html",
        "/applications/finance/accounting/get_started/multi_currency.html",
        "/applications/finance/accounting/reporting/budget.html",
        "/applications/finance/accounting/reporting/analytic_accounting.html",
        "/applications/finance/accounting/bank.html",
        "/applications/finance/accounting/taxes.html",
        "/applications/finance/accounting/reporting.html",
        "/applications/finance/expenses.html",
        "/applications/finance/expenses/reinvoice_expenses.html",
        "/applications/finance/payment_providers.html",
        "/applications/finance.html",
        "/applications/sales.html",
        "/applications/sales/sales.html",
        "/applications/sales/crm.html",
        "/applications/sales/crm/pipeline.html",
        "/applications/sales/crm/acquire_leads/email_manual.html",
        "/applications/sales/crm/pipeline/manage_sales_teams.html",
        "/applications/sales/crm/optimize/utilize_activities.html",
        "/applications/inventory_and_mrp/inventory.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/mto.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/reordering_rules.html",
        "/applications/inventory_and_mrp/inventory/shipping_receiving/daily_operations.html",
        "/applications/inventory_and_mrp/purchase.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/rfq.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/manage.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/blanket_orders.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/calls_for_tenders.html",
        "/applications/inventory_and_mrp/manufacturing.html",
        "/applications/inventory_and_mrp/manufacturing/workflows.html",
        "/applications/inventory_and_mrp/manufacturing/workflows/use_mps.html",
        "/applications/inventory_and_mrp/manufacturing/workflows/manufacturing_backorders.html",
        "/applications/inventory_and_mrp/manufacturing/subcontracting.html",
        "/applications/inventory_and_mrp/manufacturing/advanced_configuration/kit_shipping.html",
        "/applications/hr.html",
        "/applications/hr/employees.html",
        "/applications/hr/employees/new_employee.html",
        "/applications/hr/payroll.html",
        "/applications/hr/payroll/contracts.html",
        "/applications/hr/payroll/payslips.html",
        "/applications/hr/payroll/batches.html",
        "/applications/websites/ecommerce.html",
        "/applications/websites/ecommerce/products.html",
        "/applications/websites/ecommerce/checkout_payment_shipping/checkout.html",
        "/applications/websites/ecommerce/checkout_payment_shipping/payments.html",
        "/applications/websites/ecommerce/customer_accounts.html",
        "/applications/services/helpdesk.html",
        "/applications/services/helpdesk/advanced/after_sales.html",
        "/applications/services/project.html",
        "/applications/finance/fiscal_localizations/united_states.html",
        "/applications.html",
        "/applications/general.html",
    ]
    # NOTE: urljoin drops the /documentation/18.0 path when given an absolute
    # path arg — use string concatenation instead.
    # dict.fromkeys removes any repeated path while preserving list order.
    return [BASE_URL + p for p in dict.fromkeys(paths)]
|
||||
|
||||
|
||||
def infer_module(url: str) -> str:
|
||||
@@ -286,12 +297,18 @@ def crawl(module: str | None = None, limit: int | None = None, debug: bool = Fal
|
||||
|
||||
try:
|
||||
urls = fetch_sitemap_urls(SITEMAP_URL, module)
|
||||
if not urls:
|
||||
raise ValueError("Sitemap returned 0 matching URLs")
|
||||
except Exception as e:
|
||||
log.warning(f"Sitemap unavailable ({e}), using fallback list")
|
||||
urls = fallback_urls()
|
||||
if module:
|
||||
path = MODULE_PATHS.get(module, "")
|
||||
urls = [u for u in urls if path.lstrip("/") in u]
|
||||
log.warning(f"Sitemap unavailable ({e}), switching to crawl discovery")
|
||||
try:
|
||||
urls = discover_urls_by_crawl(module)
|
||||
except Exception as e2:
|
||||
log.warning(f"Crawl discovery failed ({e2}), using hardcoded fallback list")
|
||||
urls = fallback_urls()
|
||||
if module:
|
||||
path = MODULE_PATHS.get(module, "")
|
||||
urls = [u for u in urls if path in u]
|
||||
|
||||
if limit:
|
||||
urls = urls[:limit]
|
||||
|
||||
Reference in New Issue
Block a user