fix: replace dead sitemap with crawl-based URL discovery

The Odoo 18 sitemap.xml returns 404. The fallback URL list also failed because urljoin(BASE_URL, "/applications/...") strips the /documentation/18.0 prefix: an absolute-path argument replaces the base URL's entire path component.

Changes:
- Add discover_urls_by_crawl(): fetches each module index page and collects all internal links; it is used in place of the sitemap when the sitemap is unavailable
- crawl() now chains: sitemap → crawl discovery → hardcoded fallback
- Fix fallback_urls() to use BASE_URL + path (not urljoin) and trim the list to known-good pages
- Keep crawl discovery rate-limited (0.5s between module seeds)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-14 13:05:40 -04:00
parent 3d94c4eb25
commit 608bb51943
1 changed files with 61 additions and 44 deletions
--- a/scraper/scraper.py
+++ b/scraper/scraper.py
@@ -115,68 +115,79 @@ def fetch_sitemap_urls(sitemap_url: str, module_filter: str | None) -> list:
    return urls


+def discover_urls_by_crawl(module_filter: str | None = None) -> list:
+    """
+    Discover doc URLs by fetching each module's index page and collecting
+    every internal link that stays within that module's path.
+    This replaces the sitemap when it is unavailable.
+    """
+    if module_filter:
+        path = MODULE_PATHS.get(module_filter)
+        if not path:
+            raise ValueError(f"Unknown module '{module_filter}'. Choices: {', '.join(MODULE_PATHS)}")
+        module_paths = {module_filter: path}
+    else:
+        module_paths = MODULE_PATHS
+
+    found: set = set()
+
+    for module, path in module_paths.items():
+        # Module index pages are at <path>.html or <path>/index.html
+        seeds = [
+            BASE_URL + path + ".html",
+            BASE_URL + path + "/index.html",
+        ]
+        for seed in seeds:
+            try:
+                resp = requests.get(seed, headers=HEADERS, timeout=REQUEST_TIMEOUT)
+                if resp.status_code != 200:
+                    continue
+                found.add(seed)
+                soup = BeautifulSoup(resp.text, "html.parser")
+                prefix = BASE_URL + path
+                for a in soup.find_all("a", href=True):
+                    abs_url = urljoin(seed, a["href"]).split("#")[0]
+                    if abs_url.startswith(prefix) and abs_url not in found:
+                        found.add(abs_url)
+                log.info(f"  {module}: {len([u for u in found if path in u])} URLs from {seed}")
+                time.sleep(0.5)
+                break  # one successful seed per module is enough
+            except Exception as e:
+                log.debug(f"  seed {seed} failed: {e}")
+
+    urls = sorted(found)
+    log.info(f"Crawl discovery: {len(urls)} total URLs across {len(module_paths)} modules")
+    return urls
+
+
 def fallback_urls() -> list:
-    """Curated fallback list if sitemap is unavailable."""
+    """Last-resort hardcoded list when both sitemap and crawl discovery fail."""
    paths = [
        "/applications/finance/accounting.html",
        "/applications/finance/accounting/customer_invoices.html",
-        "/applications/finance/accounting/customer_invoices/overview.html",
        "/applications/finance/accounting/vendor_bills.html",
        "/applications/finance/accounting/get_started/chart_of_accounts.html",
-        "/applications/finance/accounting/get_started/cheat_sheet.html",
-        "/applications/finance/accounting/get_started/multi_currency.html",
-        "/applications/finance/accounting/reporting/budget.html",
-        "/applications/finance/accounting/reporting/analytic_accounting.html",
-        "/applications/finance/accounting/bank.html",
        "/applications/finance/accounting/taxes.html",
        "/applications/finance/accounting/reporting.html",
+        "/applications/finance/accounting/bank.html",
        "/applications/finance/expenses.html",
-        "/applications/finance/expenses/reinvoice_expenses.html",
        "/applications/finance/payment_providers.html",
-        "/applications/finance.html",
-        "/applications/sales.html",
        "/applications/sales/sales.html",
        "/applications/sales/crm.html",
        "/applications/sales/crm/pipeline.html",
-        "/applications/sales/crm/acquire_leads/email_manual.html",
-        "/applications/sales/crm/pipeline/manage_sales_teams.html",
-        "/applications/sales/crm/optimize/utilize_activities.html",
        "/applications/inventory_and_mrp/inventory.html",
-        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment.html",
-        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/mto.html",
-        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/reordering_rules.html",
-        "/applications/inventory_and_mrp/inventory/shipping_receiving/daily_operations.html",
        "/applications/inventory_and_mrp/purchase.html",
-        "/applications/inventory_and_mrp/purchase/manage_deals/rfq.html",
-        "/applications/inventory_and_mrp/purchase/manage_deals/manage.html",
-        "/applications/inventory_and_mrp/purchase/manage_deals/blanket_orders.html",
-        "/applications/inventory_and_mrp/purchase/manage_deals/calls_for_tenders.html",
        "/applications/inventory_and_mrp/manufacturing.html",
-        "/applications/inventory_and_mrp/manufacturing/workflows.html",
-        "/applications/inventory_and_mrp/manufacturing/workflows/use_mps.html",
-        "/applications/inventory_and_mrp/manufacturing/workflows/manufacturing_backorders.html",
-        "/applications/inventory_and_mrp/manufacturing/subcontracting.html",
-        "/applications/inventory_and_mrp/manufacturing/advanced_configuration/kit_shipping.html",
-        "/applications/hr.html",
        "/applications/hr/employees.html",
-        "/applications/hr/employees/new_employee.html",
        "/applications/hr/payroll.html",
-        "/applications/hr/payroll/contracts.html",
        "/applications/hr/payroll/payslips.html",
-        "/applications/hr/payroll/batches.html",
        "/applications/websites/ecommerce.html",
-        "/applications/websites/ecommerce/products.html",
-        "/applications/websites/ecommerce/checkout_payment_shipping/checkout.html",
-        "/applications/websites/ecommerce/checkout_payment_shipping/payments.html",
-        "/applications/websites/ecommerce/customer_accounts.html",
        "/applications/services/helpdesk.html",
-        "/applications/services/helpdesk/advanced/after_sales.html",
        "/applications/services/project.html",
-        "/applications/finance/fiscal_localizations/united_states.html",
-        "/applications.html",
-        "/applications/general.html",
    ]
-    return [urljoin(BASE_URL, p) for p in paths]
+    # NOTE: urljoin drops the /documentation/18.0 path when given an absolute
+    # path arg — use string concatenation instead.
+    return [BASE_URL + p for p in paths]


 def infer_module(url: str) -> str:
@@ -286,12 +297,18 @@ def crawl(module: str | None = None, limit: int | None = None, debug: bool = Fal

    try:
        urls = fetch_sitemap_urls(SITEMAP_URL, module)
+        if not urls:
+            raise ValueError("Sitemap returned 0 matching URLs")
    except Exception as e:
-        log.warning(f"Sitemap unavailable ({e}), using fallback list")
-        urls = fallback_urls()
-        if module:
-            path = MODULE_PATHS.get(module, "")
-            urls = [u for u in urls if path.lstrip("/") in u]
+        log.warning(f"Sitemap unavailable ({e}), switching to crawl discovery")
+        try:
+            urls = discover_urls_by_crawl(module)
+        except Exception as e2:
+            log.warning(f"Crawl discovery failed ({e2}), using hardcoded fallback list")
+            urls = fallback_urls()
+            if module:
+                path = MODULE_PATHS.get(module, "")
+                urls = [u for u in urls if path in u]

    if limit:
        urls = urls[:limit]