When generating cleaned_html, the "script" tag needs to be processed separately.
This commit is contained in:
@@ -1668,6 +1668,11 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
content_element = body
|
content_element = body
|
||||||
|
|
||||||
# Remove script and style tags
|
# Remove script and style tags
|
||||||
|
for tag in ["style", "link", "meta", "noscript"]:
|
||||||
|
for element in body.xpath(f".//{tag}"):
|
||||||
|
if element.getparent() is not None:
|
||||||
|
element.getparent().remove(element)
|
||||||
|
|
||||||
# Handle script separately
|
# Handle script separately
|
||||||
for element in body.xpath(f".//script"):
|
for element in body.xpath(f".//script"):
|
||||||
parent = element.getparent()
|
parent = element.getparent()
|
||||||
@@ -1687,10 +1692,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
parent.text = tail
|
parent.text = tail
|
||||||
parent.remove(element) # Delete the element
|
parent.remove(element) # Delete the element
|
||||||
|
|
||||||
for tag in ["style", "link", "meta", "noscript"]:
|
|
||||||
for element in body.xpath(f".//{tag}"):
|
|
||||||
if element.getparent() is not None:
|
|
||||||
element.getparent().remove(element)
|
|
||||||
|
|
||||||
# Handle social media and domain exclusions
|
# Handle social media and domain exclusions
|
||||||
kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))
|
kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))
|
||||||
|
|||||||
Reference in New Issue
Block a user