From c2c4d42be4b87893251e46efb1cee6ea9471eff6 Mon Sep 17 00:00:00 2001
From: ntohidi <nasrin@kidocode.com>
Date: Mon, 17 Nov 2025 12:21:23 +0100
Subject: [PATCH] Fix #1181: Preserve whitespace in code blocks during HTML
 scraping

  The remove_empty_elements_fast() method was removing whitespace-only
  span elements inside <pre> and <code> tags, causing import statements
  like "import torch" to become "importtorch". The method now leaves any
  element nested inside a <pre> or <code> ancestor untouched, because
  whitespace is significant in code blocks.
---
 crawl4ai/content_scraping_strategy.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index d9095e49..e915ff5b 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -542,6 +542,19 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
             if el.tag in bypass_tags:
                 continue
 
+            # Never prune elements nested inside <pre> or <code>: whitespace is significant there.
+            # This keeps whitespace-only spans (e.g., <span class="w"> </span>) intact in code blocks.
+            is_in_code_block = False
+            ancestor = el.getparent()
+            while ancestor is not None:
+                if ancestor.tag in ("pre", "code"):
+                    is_in_code_block = True
+                    break
+                ancestor = ancestor.getparent()
+
+            if is_in_code_block:
+                continue
+
             text_content = (el.text_content() or "").strip()
             if (
                 len(text_content.split()) < word_count_threshold