Merge PR #1364: Fix script tag removal losing adjacent text in cleaned_html
This commit is contained in:
@@ -709,10 +709,30 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
content_element = body
|
content_element = body
|
||||||
|
|
||||||
# Remove script and style tags
|
# Remove script and style tags
|
||||||
for tag in ["script", "style", "link", "meta", "noscript"]:
|
for tag in ["style", "link", "meta", "noscript"]:
|
||||||
for element in body.xpath(f".//{tag}"):
|
for element in body.xpath(f".//{tag}"):
|
||||||
if element.getparent() is not None:
|
if element.getparent() is not None:
|
||||||
element.getparent().remove(element)
|
element.getparent().remove(element)
|
||||||
|
|
||||||
|
# Handle script separately
|
||||||
|
for element in body.xpath(f".//script"):
|
||||||
|
parent = element.getparent()
|
||||||
|
if parent is not None:
|
||||||
|
tail = element.tail # Get the tail text
|
||||||
|
if tail:
|
||||||
|
prev = element.getprevious() # Get the previous sibling node
|
||||||
|
if prev is not None:
|
||||||
|
if prev.tail:
|
||||||
|
prev.tail += tail
|
||||||
|
else:
|
||||||
|
prev.tail = tail
|
||||||
|
else:
|
||||||
|
if parent.text:
|
||||||
|
parent.text += tail
|
||||||
|
else:
|
||||||
|
parent.text = tail
|
||||||
|
parent.remove(element) # Delete the element
|
||||||
|
|
||||||
|
|
||||||
# Handle social media and domain exclusions
|
# Handle social media and domain exclusions
|
||||||
kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))
|
kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))
|
||||||
|
|||||||
Reference in New Issue
Block a user