reverse: undo the last change to the order of execution, because it introduced a new issue in the generated content. https://github.com/unclecode/crawl4ai/issues/902
This commit is contained in:
@@ -901,7 +901,22 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
element.extract()
|
element.extract()
|
||||||
else:
|
else:
|
||||||
for element in body.select(excluded_selector):
|
for element in body.select(excluded_selector):
|
||||||
element.extract()
|
element.extract()
|
||||||
|
|
||||||
|
content_element = None
|
||||||
|
if target_elements:
|
||||||
|
try:
|
||||||
|
for_content_targeted_element = []
|
||||||
|
for target_element in target_elements:
|
||||||
|
for_content_targeted_element.extend(body.select(target_element))
|
||||||
|
content_element = soup.new_tag("div")
|
||||||
|
for el in for_content_targeted_element:
|
||||||
|
content_element.append(el)
|
||||||
|
except Exception as e:
|
||||||
|
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
content_element = body
|
||||||
|
|
||||||
kwargs["exclude_social_media_domains"] = set(
|
kwargs["exclude_social_media_domains"] = set(
|
||||||
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
|
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
|
||||||
@@ -961,20 +976,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
|
|
||||||
str_body = ""
|
str_body = ""
|
||||||
try:
|
try:
|
||||||
content_element = None
|
|
||||||
if target_elements:
|
|
||||||
try:
|
|
||||||
for_content_targeted_element = []
|
|
||||||
for target_element in target_elements:
|
|
||||||
for_content_targeted_element.extend(body.select(target_element))
|
|
||||||
content_element = soup.new_tag("div")
|
|
||||||
for el in for_content_targeted_element:
|
|
||||||
content_element.append(el)
|
|
||||||
except Exception as e:
|
|
||||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
content_element = body
|
|
||||||
str_body = content_element.encode_contents().decode("utf-8")
|
str_body = content_element.encode_contents().decode("utf-8")
|
||||||
except Exception:
|
except Exception:
|
||||||
# Reset body to the original HTML
|
# Reset body to the original HTML
|
||||||
@@ -1531,6 +1532,20 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
|
self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
|
||||||
meta = {}
|
meta = {}
|
||||||
|
|
||||||
|
content_element = None
|
||||||
|
if target_elements:
|
||||||
|
try:
|
||||||
|
for_content_targeted_element = []
|
||||||
|
for target_element in target_elements:
|
||||||
|
for_content_targeted_element.extend(body.cssselect(target_element))
|
||||||
|
content_element = lhtml.Element("div")
|
||||||
|
content_element.extend(for_content_targeted_element)
|
||||||
|
except Exception as e:
|
||||||
|
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
content_element = body
|
||||||
|
|
||||||
# Remove script and style tags
|
# Remove script and style tags
|
||||||
for tag in ["script", "style", "link", "meta", "noscript"]:
|
for tag in ["script", "style", "link", "meta", "noscript"]:
|
||||||
for element in body.xpath(f".//{tag}"):
|
for element in body.xpath(f".//{tag}"):
|
||||||
@@ -1599,19 +1614,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Generate output HTML
|
# Generate output HTML
|
||||||
content_element = None
|
|
||||||
if target_elements:
|
|
||||||
try:
|
|
||||||
for_content_targeted_element = []
|
|
||||||
for target_element in target_elements:
|
|
||||||
for_content_targeted_element.extend(body.cssselect(target_element))
|
|
||||||
content_element = lhtml.Element("div")
|
|
||||||
content_element.extend(for_content_targeted_element)
|
|
||||||
except Exception as e:
|
|
||||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
content_element = body
|
|
||||||
cleaned_html = lhtml.tostring(
|
cleaned_html = lhtml.tostring(
|
||||||
# body,
|
# body,
|
||||||
content_element,
|
content_element,
|
||||||
|
|||||||
Reference in New Issue
Block a user