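When building cleaned_html, "script" elements need to be handled separately: the text that follows a removed script element (its tail) is appended to the previous sibling's tail, or to the parent's text when there is no previous sibling, so it is not lost along with the element.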
This commit is contained in:
@@ -1668,7 +1668,26 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
content_element = body
|
content_element = body
|
||||||
|
|
||||||
# Remove script and style tags
|
# Remove script and style tags
|
||||||
for tag in ["script", "style", "link", "meta", "noscript"]:
|
# Handle script separately
|
||||||
|
for element in body.xpath(f".//script"):
|
||||||
|
parent = element.getparent()
|
||||||
|
if parent is not None:
|
||||||
|
tail = element.tail # Get the tail text
|
||||||
|
if tail:
|
||||||
|
prev = element.getprevious() # Get the previous sibling node
|
||||||
|
if prev is not None:
|
||||||
|
if prev.tail:
|
||||||
|
prev.tail += tail
|
||||||
|
else:
|
||||||
|
prev.tail = tail
|
||||||
|
else:
|
||||||
|
if parent.text:
|
||||||
|
parent.text += tail
|
||||||
|
else:
|
||||||
|
parent.text = tail
|
||||||
|
parent.remove(element) # Delete the element
|
||||||
|
|
||||||
|
for tag in ["style", "link", "meta", "noscript"]:
|
||||||
for element in body.xpath(f".//{tag}"):
|
for element in body.xpath(f".//{tag}"):
|
||||||
if element.getparent() is not None:
|
if element.getparent() is not None:
|
||||||
element.getparent().remove(element)
|
element.getparent().remove(element)
|
||||||
|
|||||||
Reference in New Issue
Block a user