Fix #1181: Preserve whitespace in code blocks during HTML scraping
The remove_empty_elements_fast() method was removing whitespace-only span elements (e.g., <span class="w"> </span>) inside <pre> and <code> tags, so adjacent tokens were joined and import statements like "import torch" became "importtorch". The method now skips any element that has a <pre> or <code> ancestor, because whitespace is significant inside code blocks.
This commit is contained in:
@@ -542,6 +542,19 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
if el.tag in bypass_tags:
|
if el.tag in bypass_tags:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Skip elements inside <pre> or <code> tags where whitespace is significant
|
||||||
|
# This preserves whitespace-only spans (e.g., <span class="w"> </span>) in code blocks
|
||||||
|
is_in_code_block = False
|
||||||
|
ancestor = el.getparent()
|
||||||
|
while ancestor is not None:
|
||||||
|
if ancestor.tag in ("pre", "code"):
|
||||||
|
is_in_code_block = True
|
||||||
|
break
|
||||||
|
ancestor = ancestor.getparent()
|
||||||
|
|
||||||
|
if is_in_code_block:
|
||||||
|
continue
|
||||||
|
|
||||||
text_content = (el.text_content() or "").strip()
|
text_content = (el.text_content() or "").strip()
|
||||||
if (
|
if (
|
||||||
len(text_content.split()) < word_count_threshold
|
len(text_content.split()) < word_count_threshold
|
||||||
|
|||||||
Reference in New Issue
Block a user