fix: strip Sphinx pilcrow artifacts from extracted text

Sphinx generates headerlink anchors (the pilcrow symbol ¶) next to each heading. These appear as Â¶ in the extracted text due to a UTF-8/Latin-1 decode mismatch in BeautifulSoup: ¶ is encoded in UTF-8 as the bytes 0xC2 0xB6, which read as Latin-1 become Â followed by ¶. Fix by adding .headerlink to NOISE_SELECTORS so the anchor elements are removed, and by stripping any residual ¶/Â¶ in clean_text().

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-14 14:03:30 -04:00
parent 608bb51943
commit bc054cd478
1 changed file with 3 additions and 0 deletions
--- a/scraper/scraper.py
+++ b/scraper/scraper.py
@@ -78,6 +78,7 @@ NOISE_SELECTORS = [
    ".rst-footer-buttons", "#edit-on-github",
    "[role='navigation']", ".breadcrumbs",
    ".sidebar", ".sphinxsidebar",
    ".headerlink",   # Sphinx ¶ permalink anchors
    "script", "style",
 ]
@@ -248,6 +249,8 @@ def clean_text(soup: BeautifulSoup) -> tuple:
    raw = "\n".join(lines)
    clean = re.sub(r"\n{3,}", "\n\n", raw).strip()
    # Strip residual Sphinx pilcrow characters (¶ and its mis-decoded form Â¶)
    clean = re.sub(r"Â¶|¶", "", clean).strip()
    return clean, headings