fix: revert to the old target_elements code in the regular web scraping strategy
This commit is contained in:
@@ -908,11 +908,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
try:
|
try:
|
||||||
for_content_targeted_element = []
|
for_content_targeted_element = []
|
||||||
for target_element in target_elements:
|
for target_element in target_elements:
|
||||||
# Creating a fresh parse of HTML for each selector to prevent element extraction
|
for_content_targeted_element.extend(body.select(target_element))
|
||||||
# from modifying the original DOM tree; this keeps the original body
|
|
||||||
# intact for link processing. This performs better than deepcopy.
|
|
||||||
fresh_body = BeautifulSoup(html, "lxml")
|
|
||||||
for_content_targeted_element.extend(fresh_body.select(target_element))
|
|
||||||
content_element = soup.new_tag("div")
|
content_element = soup.new_tag("div")
|
||||||
for el in for_content_targeted_element:
|
for el in for_content_targeted_element:
|
||||||
content_element.append(el)
|
content_element.append(el)
|
||||||
@@ -920,7 +916,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
content_element = body
|
content_element = body
|
||||||
|
|
||||||
kwargs["exclude_social_media_domains"] = set(
|
kwargs["exclude_social_media_domains"] = set(
|
||||||
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
|
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
|
||||||
|
|||||||
Reference in New Issue
Block a user