diff --git a/scraper/scraper.py b/scraper/scraper.py index 52d6cb5..fff77b5 100644 --- a/scraper/scraper.py +++ b/scraper/scraper.py @@ -78,6 +78,7 @@ NOISE_SELECTORS = [ ".rst-footer-buttons", "#edit-on-github", "[role='navigation']", ".breadcrumbs", ".sidebar", ".sphinxsidebar", + ".headerlink", # Sphinx ¶ permalink anchors "script", "style", ] @@ -248,6 +249,8 @@ def clean_text(soup: BeautifulSoup) -> tuple: raw = "\n".join(lines) clean = re.sub(r"\n{3,}", "\n\n", raw).strip() + # Strip residual Sphinx pilcrow characters (¶ and its mis-decoded form Â¶) + clean = re.sub(r"¶|Â¶", "", clean).strip() return clean, headings