diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index d9095e49..e915ff5b 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -542,6 +542,19 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy): if el.tag in bypass_tags: continue + # Skip elements inside <pre> or <code> tags where whitespace is significant
+ # This preserves whitespace-only spans (e.g., <span> </span>) in code blocks
+ is_in_code_block = False
+ ancestor = el.getparent()
+ while ancestor is not None:
+ if ancestor.tag in ("pre", "code"):
+ is_in_code_block = True
+ break
+ ancestor = ancestor.getparent()
+
+ if is_in_code_block:
+ continue
+
text_content = (el.text_content() or "").strip()
if (
len(text_content.split()) < word_count_threshold