diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index d9095e49..e915ff5b 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -542,6 +542,19 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy): if el.tag in bypass_tags: continue + # Skip elements inside <pre> or <code> tags where whitespace is significant
+            # This preserves whitespace-only spans (e.g., <span> </span>) in code blocks
+            is_in_code_block = False
+            ancestor = el.getparent()
+            while ancestor is not None:
+                if ancestor.tag in ("pre", "code"):
+                    is_in_code_block = True
+                    break
+                ancestor = ancestor.getparent()
+
+            if is_in_code_block:
+                continue
+
             text_content = (el.text_content() or "").strip()
             if (
                 len(text_content.split()) < word_count_threshold