When building cleaned_html, the "script" tag needs to be handled separately.

This commit is contained in:
lizhuxiong
2025-08-05 16:27:03 +08:00
parent 6d3444ba17
commit 660d7011b9

View File

@@ -1668,6 +1668,11 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
content_element = body
# Remove script and style tags
for tag in ["style", "link", "meta", "noscript"]:
for element in body.xpath(f".//{tag}"):
if element.getparent() is not None:
element.getparent().remove(element)
# Handle script separately
for element in body.xpath(f".//script"):
parent = element.getparent()
@@ -1687,10 +1692,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
parent.text = tail
parent.remove(element) # Delete the element
for tag in ["style", "link", "meta", "noscript"]:
for element in body.xpath(f".//{tag}"):
if element.getparent() is not None:
element.getparent().remove(element)
# Handle social media and domain exclusions
kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))