fix: replace dead sitemap with crawl-based URL discovery
The Odoo 18 sitemap.xml returns 404. The fallback URL list also failed because urljoin(BASE_URL, "/applications/...") strips the /documentation/18.0 path (an absolute-path argument replaces the whole path component in urljoin).

Changes:
- Add discover_urls_by_crawl(): fetches each module index page and collects all internal links — replaces the sitemap as the primary source
- crawl() now chains: sitemap → crawl discovery → hardcoded fallback
- Fix fallback_urls() to use BASE_URL + path (not urljoin) and trim the list to known-good pages
- Keep crawl discovery rate-limited (0.5s between module seeds)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -115,68 +115,79 @@ def fetch_sitemap_urls(sitemap_url: str, module_filter: str | None) -> list:
|
|||||||
return urls
|
return urls
|
||||||
|
|
||||||
|
|
||||||
|
def discover_urls_by_crawl(module_filter: str | None = None) -> list:
    """
    Discover doc URLs by fetching each module's index page and collecting
    every internal link that stays within that module's path.
    This replaces the sitemap when it is unavailable.

    Args:
        module_filter: Key of MODULE_PATHS restricting discovery to a single
            module. None (or an empty string) crawls every module.

    Returns:
        Sorted list of unique absolute URLs with fragments stripped. The
        index page that was fetched successfully for a module is included.

    Raises:
        ValueError: If module_filter is not a key of MODULE_PATHS.
        RuntimeError: If not a single URL could be discovered, so that the
            caller can fall through to its next URL source instead of
            silently continuing with an empty list.
    """
    if module_filter:
        path = MODULE_PATHS.get(module_filter)
        if not path:
            raise ValueError(f"Unknown module '{module_filter}'. Choices: {', '.join(MODULE_PATHS)}")
        module_paths = {module_filter: path}
    else:
        module_paths = MODULE_PATHS

    # Tolerate a trailing slash on BASE_URL so the joined URLs never contain "//".
    base = BASE_URL.rstrip("/")
    found: set[str] = set()

    for module, path in module_paths.items():
        prefix = base + path
        # Module index pages are at <path>.html or <path>/index.html
        for seed in (prefix + ".html", prefix + "/index.html"):
            try:
                resp = requests.get(seed, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as e:
                log.debug(f" seed {seed} failed: {e}")
                resp = None
            # Rate-limit every request, including failed and non-200 ones.
            time.sleep(0.5)
            if resp is None or resp.status_code != 200:
                continue

            module_urls = {seed}
            soup = BeautifulSoup(resp.text, "html.parser")
            for a in soup.find_all("a", href=True):
                try:
                    abs_url = urljoin(seed, a["href"]).split("#")[0]
                except ValueError:
                    # urljoin rejects some malformed hrefs; skip only that link.
                    continue
                if abs_url.startswith(prefix):
                    module_urls.add(abs_url)

            found |= module_urls
            log.info(f" {module}: {len(module_urls)} URLs from {seed}")
            break  # one successful seed per module is enough

    if not found:
        raise RuntimeError("Crawl discovery found 0 URLs")

    urls = sorted(found)
    log.info(f"Crawl discovery: {len(urls)} total URLs across {len(module_paths)} modules")
    return urls
|
||||||
|
|
||||||
|
|
||||||
def fallback_urls() -> list:
    """Last-resort hardcoded list when both sitemap and crawl discovery fail.

    Returns:
        Absolute URLs built by appending each curated documentation path
        to BASE_URL.
    """
    paths = [
        "/applications/finance/accounting.html",
        "/applications/finance/accounting/customer_invoices.html",
        "/applications/finance/accounting/vendor_bills.html",
        "/applications/finance/accounting/get_started/chart_of_accounts.html",
        "/applications/finance/accounting/taxes.html",
        "/applications/finance/accounting/reporting.html",
        "/applications/finance/accounting/bank.html",
        "/applications/finance/expenses.html",
        "/applications/finance/payment_providers.html",
        "/applications/sales/sales.html",
        "/applications/sales/crm.html",
        "/applications/sales/crm/pipeline.html",
        "/applications/inventory_and_mrp/inventory.html",
        "/applications/inventory_and_mrp/purchase.html",
        "/applications/inventory_and_mrp/manufacturing.html",
        "/applications/hr/employees.html",
        "/applications/hr/payroll.html",
        "/applications/hr/payroll/payslips.html",
        "/applications/websites/ecommerce.html",
        "/applications/services/helpdesk.html",
        "/applications/services/project.html",
    ]
    # NOTE: urljoin drops the /documentation/18.0 path when given an absolute
    # path arg — use string concatenation instead. Stripping a trailing slash
    # from BASE_URL keeps the result free of "//" either way.
    base = BASE_URL.rstrip("/")
    return [base + p for p in paths]
|
||||||
|
|
||||||
|
|
||||||
def infer_module(url: str) -> str:
|
def infer_module(url: str) -> str:
|
||||||
@@ -286,12 +297,18 @@ def crawl(module: str | None = None, limit: int | None = None, debug: bool = Fal
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
urls = fetch_sitemap_urls(SITEMAP_URL, module)
|
urls = fetch_sitemap_urls(SITEMAP_URL, module)
|
||||||
|
if not urls:
|
||||||
|
raise ValueError("Sitemap returned 0 matching URLs")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning(f"Sitemap unavailable ({e}), using fallback list")
|
log.warning(f"Sitemap unavailable ({e}), switching to crawl discovery")
|
||||||
|
try:
|
||||||
|
urls = discover_urls_by_crawl(module)
|
||||||
|
except Exception as e2:
|
||||||
|
log.warning(f"Crawl discovery failed ({e2}), using hardcoded fallback list")
|
||||||
urls = fallback_urls()
|
urls = fallback_urls()
|
||||||
if module:
|
if module:
|
||||||
path = MODULE_PATHS.get(module, "")
|
path = MODULE_PATHS.get(module, "")
|
||||||
urls = [u for u in urls if path.lstrip("/") in u]
|
urls = [u for u in urls if path in u]
|
||||||
|
|
||||||
if limit:
|
if limit:
|
||||||
urls = urls[:limit]
|
urls = urls[:limit]
|
||||||
|
|||||||
Reference in New Issue
Block a user