diff --git a/scraper/scraper.py b/scraper/scraper.py
index d077538..52d6cb5 100644
--- a/scraper/scraper.py
+++ b/scraper/scraper.py
@@ -115,68 +115,93 @@ def fetch_sitemap_urls(sitemap_url: str, module_filter: str | None) -> list:
     return urls
 
 
+def discover_urls_by_crawl(module_filter: str | None = None) -> list:
+    """
+    Discover doc URLs by fetching each module's index page and collecting
+    every internal link that stays within that module's path.
+    This replaces the sitemap when it is unavailable.
+
+    Only each module's index page is fetched; links found on it are
+    collected but not followed. A seed that errors or returns a non-200
+    status is skipped, so the result can be an empty list.
+
+    Args:
+        module_filter: key of MODULE_PATHS to restrict discovery to, or
+            None to cover every module.
+
+    Returns:
+        Sorted list of unique URLs with fragments stripped.
+
+    Raises:
+        ValueError: if module_filter has no entry in MODULE_PATHS.
+    """
+    if module_filter:
+        path = MODULE_PATHS.get(module_filter)
+        if not path:
+            raise ValueError(f"Unknown module '{module_filter}'. Choices: {', '.join(MODULE_PATHS)}")
+        module_paths = {module_filter: path}
+    else:
+        module_paths = MODULE_PATHS
+
+    found: set = set()
+
+    for module, path in module_paths.items():
+        # Module index pages are at .html or /index.html
+        seeds = [
+            BASE_URL + path + ".html",
+            BASE_URL + path + "/index.html",
+        ]
+        for seed in seeds:
+            try:
+                resp = requests.get(seed, headers=HEADERS, timeout=REQUEST_TIMEOUT)
+                if resp.status_code != 200:
+                    continue
+                found.add(seed)
+                soup = BeautifulSoup(resp.text, "html.parser")
+                prefix = BASE_URL + path
+                for a in soup.find_all("a", href=True):
+                    abs_url = urljoin(seed, a["href"]).split("#")[0]
+                    if abs_url.startswith(prefix) and abs_url not in found:
+                        found.add(abs_url)
+                log.info(f" {module}: {len([u for u in found if path in u])} URLs from {seed}")
+                time.sleep(0.5)
+                break  # one successful seed per module is enough
+            except Exception as e:
+                log.debug(f" seed {seed} failed: {e}")
+
+    urls = sorted(found)
+    log.info(f"Crawl discovery: {len(urls)} total URLs across {len(module_paths)} modules")
+    return urls
+
+
 def fallback_urls() -> list:
-    """Curated fallback list if sitemap is unavailable."""
+    """Last-resort hardcoded list when both sitemap and crawl discovery fail."""
     paths = [
         "/applications/finance/accounting.html",
         "/applications/finance/accounting/customer_invoices.html",
-        "/applications/finance/accounting/customer_invoices/overview.html",
         "/applications/finance/accounting/vendor_bills.html",
         "/applications/finance/accounting/get_started/chart_of_accounts.html",
-        "/applications/finance/accounting/get_started/cheat_sheet.html",
-        "/applications/finance/accounting/get_started/multi_currency.html",
-        "/applications/finance/accounting/reporting/budget.html",
-        "/applications/finance/accounting/reporting/analytic_accounting.html",
-        "/applications/finance/accounting/bank.html",
         "/applications/finance/accounting/taxes.html",
         "/applications/finance/accounting/reporting.html",
+        "/applications/finance/accounting/bank.html",
         "/applications/finance/expenses.html",
-        "/applications/finance/expenses/reinvoice_expenses.html",
         "/applications/finance/payment_providers.html",
-        "/applications/finance.html",
-        "/applications/sales.html",
         "/applications/sales/sales.html",
         "/applications/sales/crm.html",
         "/applications/sales/crm/pipeline.html",
-        "/applications/sales/crm/acquire_leads/email_manual.html",
-        "/applications/sales/crm/pipeline/manage_sales_teams.html",
-        "/applications/sales/crm/optimize/utilize_activities.html",
         "/applications/inventory_and_mrp/inventory.html",
-        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment.html",
-        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/mto.html",
-        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/reordering_rules.html",
-        "/applications/inventory_and_mrp/inventory/shipping_receiving/daily_operations.html",
         "/applications/inventory_and_mrp/purchase.html",
-        "/applications/inventory_and_mrp/purchase/manage_deals/rfq.html",
-        "/applications/inventory_and_mrp/purchase/manage_deals/manage.html",
-        "/applications/inventory_and_mrp/purchase/manage_deals/blanket_orders.html",
-        "/applications/inventory_and_mrp/purchase/manage_deals/calls_for_tenders.html",
         "/applications/inventory_and_mrp/manufacturing.html",
-        "/applications/inventory_and_mrp/manufacturing/workflows.html",
-        "/applications/inventory_and_mrp/manufacturing/workflows/use_mps.html",
-        "/applications/inventory_and_mrp/manufacturing/workflows/manufacturing_backorders.html",
-        "/applications/inventory_and_mrp/manufacturing/subcontracting.html",
-        "/applications/inventory_and_mrp/manufacturing/advanced_configuration/kit_shipping.html",
-        "/applications/hr.html",
         "/applications/hr/employees.html",
-        "/applications/hr/employees/new_employee.html",
         "/applications/hr/payroll.html",
-        "/applications/hr/payroll/contracts.html",
         "/applications/hr/payroll/payslips.html",
-        "/applications/hr/payroll/batches.html",
         "/applications/websites/ecommerce.html",
-        "/applications/websites/ecommerce/products.html",
-        "/applications/websites/ecommerce/checkout_payment_shipping/checkout.html",
-        "/applications/websites/ecommerce/checkout_payment_shipping/payments.html",
-        "/applications/websites/ecommerce/customer_accounts.html",
         "/applications/services/helpdesk.html",
-        "/applications/services/helpdesk/advanced/after_sales.html",
         "/applications/services/project.html",
-        "/applications/finance/fiscal_localizations/united_states.html",
-        "/applications.html",
-        "/applications/general.html",
     ]
-    return [urljoin(BASE_URL, p) for p in paths]
+    # NOTE: urljoin drops the /documentation/18.0 path when given an absolute
+    # path arg — use string concatenation instead.
+    return [BASE_URL + p for p in paths]
 
 
 def infer_module(url: str) -> str:
@@ -286,12 +311,22 @@ def crawl(module: str | None = None, limit: int | None = None, debug: bool = Fal
 
     try:
         urls = fetch_sitemap_urls(SITEMAP_URL, module)
+        if not urls:
+            raise ValueError("Sitemap returned 0 matching URLs")
     except Exception as e:
-        log.warning(f"Sitemap unavailable ({e}), using fallback list")
-        urls = fallback_urls()
-        if module:
-            path = MODULE_PATHS.get(module, "")
-            urls = [u for u in urls if path.lstrip("/") in u]
+        log.warning(f"Sitemap unavailable ({e}), switching to crawl discovery")
+        try:
+            urls = discover_urls_by_crawl(module)
+            # Unreachable seeds are swallowed inside discover_urls_by_crawl, so
+            # an empty result is the only signal that discovery failed.
+            if not urls:
+                raise ValueError("Crawl discovery returned 0 URLs")
+        except Exception as e2:
+            log.warning(f"Crawl discovery failed ({e2}), using hardcoded fallback list")
+            urls = fallback_urls()
+            if module:
+                path = MODULE_PATHS.get(module, "")
+                urls = [u for u in urls if path in u]
 
     if limit:
         urls = urls[:limit]