When building cleaned_html, handle "script" tags separately so that the tail text following each removed script element is preserved.

This commit is contained in:
lizhuxiong
2025-08-05 16:18:34 +08:00
parent e3281935bc
commit 6d3444ba17

View File

@@ -1668,7 +1668,26 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
content_element = body
# Remove script and style tags
for tag in ["script", "style", "link", "meta", "noscript"]:
# Handle script separately
for element in body.xpath(f".//script"):
parent = element.getparent()
if parent is not None:
tail = element.tail # Get the tail text
if tail:
prev = element.getprevious() # Get the previous sibling node
if prev is not None:
if prev.tail:
prev.tail += tail
else:
prev.tail = tail
else:
if parent.text:
parent.text += tail
else:
parent.text = tail
parent.remove(element) # Delete the element
for tag in ["style", "link", "meta", "noscript"]:
for element in body.xpath(f".//{tag}"):
if element.getparent() is not None:
element.getparent().remove(element)