From bc054cd478c0520276184183f291d0348eddf9f2 Mon Sep 17 00:00:00 2001
From: Carlos Garcia <tocmo@DESKTOP-O6U0UOS.AVC.local>
Date: Thu, 14 May 2026 14:03:30 -0400
Subject: [PATCH] fix: strip Sphinx pilcrow artifacts from extracted text
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sphinx generates headerlink anchors (the paragraph symbol ¶) next to
each heading. These appear as Â¶ in the output due to a UTF-8/Latin-1
decode mismatch in BeautifulSoup. Fix by adding .headerlink to
NOISE_SELECTORS so those anchors are removed, and by stripping any
residual ¶/Â¶ in clean_text().

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 scraper/scraper.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scraper/scraper.py b/scraper/scraper.py
index 52d6cb5..fff77b5 100644
--- a/scraper/scraper.py
+++ b/scraper/scraper.py
@@ -78,6 +78,7 @@ NOISE_SELECTORS = [
     ".rst-footer-buttons", "#edit-on-github",
     "[role='navigation']", ".breadcrumbs",
     ".sidebar", ".sphinxsidebar",
+    ".headerlink",   # Sphinx ¶ permalink anchors
     "script", "style",
 ]
 
@@ -248,6 +249,8 @@ def clean_text(soup: BeautifulSoup) -> tuple:
 
     raw = "\n".join(lines)
     clean = re.sub(r"\n{3,}", "\n\n", raw).strip()
+    # Strip residual Sphinx pilcrow characters (¶ and its mis-decoded form Â¶)
+    clean = re.sub(r"Â¶|¶", "", clean).strip()
     return clean, headings