From bc054cd478c0520276184183f291d0348eddf9f2 Mon Sep 17 00:00:00 2001 From: Carlos Garcia Date: Thu, 14 May 2026 14:03:30 -0400 Subject: [PATCH] fix: strip Sphinx pilcrow artifacts from extracted text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sphinx generates headerlink anchors (the paragraph symbol ¶) next to each heading. These appear as Â¶ in the output due to a UTF-8/Latin-1 decode mismatch in BeautifulSoup. Fix by removing .headerlink elements in NOISE_SELECTORS and stripping any residual ¶/Â¶ in clean_text(). Co-Authored-By: Claude Sonnet 4.6 --- scraper/scraper.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scraper/scraper.py b/scraper/scraper.py index 52d6cb5..fff77b5 100644 --- a/scraper/scraper.py +++ b/scraper/scraper.py @@ -78,6 +78,7 @@ NOISE_SELECTORS = [ ".rst-footer-buttons", "#edit-on-github", "[role='navigation']", ".breadcrumbs", ".sidebar", ".sphinxsidebar", + ".headerlink", # Sphinx ¶ permalink anchors "script", "style", ] @@ -248,6 +249,8 @@ def clean_text(soup: BeautifulSoup) -> tuple: raw = "\n".join(lines) clean = re.sub(r"\n{3,}", "\n\n", raw).strip() + # Strip residual Sphinx pilcrow characters (¶ and its mis-decoded form Â¶) + clean = re.sub(r"¶|Â¶", "", clean).strip() return clean, headings