fix: revert to the old target_elements code in LXMLWebScrapingStrategy

This commit is contained in:
Aravind Karnam
2025-04-12 12:07:04 +05:30
parent d84508b4d5
commit 9fc5d315af

View File

@@ -1535,15 +1535,9 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
content_element = None content_element = None
if target_elements: if target_elements:
try: try:
content_element = lhtml.Element("div")
for target_element in target_elements:
# Creating a fresh parse of HTML for each selector to prevent element extraction
# from modifying the original DOM tree; this keeps the original body
# intact for link processing. This performs better than deepcopy.
fresh_body = lhtml.document_fromstring(html)
for_content_targeted_element = [] for_content_targeted_element = []
for target_element in target_elements: for target_element in target_elements:
for_content_targeted_element.extend(fresh_body.cssselect(target_element)) for_content_targeted_element.extend(body.cssselect(target_element))
content_element = lhtml.Element("div") content_element = lhtml.Element("div")
content_element.extend(for_content_targeted_element) content_element.extend(for_content_targeted_element)
except Exception as e: except Exception as e: