fix:target_element should not affect link extraction. -> https://github.com/unclecode/crawl4ai/issues/902

Aravind Karnam
2025-03-28 12:56:37 +05:30
parent 7be5427283
commit 57e0423b3a
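For context on what the fix changes at the API level: target_elements in CrawlerRunConfig is meant to narrow only the markdown/content extraction, while links and media keep being collected from the whole page (css_selector, by contrast, narrows everything). A minimal usage sketch, assuming the AsyncWebCrawler / CrawlerRunConfig names from the project docs; the URL and selectors are placeholders:

    import asyncio
    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

    async def main():
        # target_elements scopes the markdown to the selected nodes only;
        # with this fix, result.links is still gathered from the full page.
        config = CrawlerRunConfig(target_elements=["article.main", "div.sidebar"])
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com", config=config)
            print(result.markdown[:300])                  # content from targeted elements
            print(len(result.links.get("internal", [])))  # links from the whole page

    asyncio.run(main())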

@@ -897,29 +897,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
             for element in body.select(excluded_selector):
                 element.extract()
-        # if False and css_selector:
-        #     selected_elements = body.select(css_selector)
-        #     if not selected_elements:
-        #         return {
-        #             "markdown": "",
-        #             "cleaned_html": "",
-        #             "success": True,
-        #             "media": {"images": [], "videos": [], "audios": []},
-        #             "links": {"internal": [], "external": []},
-        #             "metadata": {},
-        #             "message": f"No elements found for CSS selector: {css_selector}",
-        #         }
-        #         # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
-        #     body = soup.new_tag("div")
-        #     for el in selected_elements:
-        #         body.append(el)
         content_element = None
         if target_elements:
             try:
                 for_content_targeted_element = []
                 for target_element in target_elements:
-                    for_content_targeted_element.extend(body.select(target_element))
+                    # Creating a fresh parse of HTML for each selector to prevent element extraction
+                    # from modifying the original DOM tree; this keeps the original body
+                    # intact for link processing. This is better performant than deepcopy.
+                    fresh_body = BeautifulSoup(html, "html.parser")
+                    for_content_targeted_element.extend(fresh_body.select(target_element))
                 content_element = soup.new_tag("div")
                 for el in for_content_targeted_element:
                     content_element.append(el)
@@ -927,7 +914,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                 self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
                 return None
         else:
-            content_element = body
+            content_element = body

         kwargs["exclude_social_media_domains"] = set(
             kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
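To see why the BeautifulSoup branch re-parses the HTML: appending a tag that already belongs to a tree moves it, so building content_element out of nodes selected from body would silently remove those nodes, and their links, from body before link extraction runs. A standalone sketch of that behaviour, not project code; markup and names are made up:

    from bs4 import BeautifulSoup

    html = '<body><div id="content"><a href="/in">in</a></div><a href="/out">out</a></body>'

    # Selecting from the working tree and appending moves nodes out of it.
    soup = BeautifulSoup(html, "html.parser")
    container = soup.new_tag("div")
    for el in soup.body.select("#content"):
        container.append(el)               # bs4 relocates an already-attached tag
    print(len(soup.body.find_all("a")))    # 1: the /in link left the body

    # The commit's approach: select from a fresh parse, leaving the original intact.
    soup = BeautifulSoup(html, "html.parser")
    fresh = BeautifulSoup(html, "html.parser")
    container = soup.new_tag("div")
    for el in fresh.select("#content"):
        container.append(el)
    print(len(soup.body.find_all("a")))    # 2: link extraction still sees both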
@@ -1536,34 +1523,20 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
             self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
             meta = {}

         # Handle CSS selector targeting
-        # if css_selector:
-        #     try:
-        #         selected_elements = body.cssselect(css_selector)
-        #         if not selected_elements:
-        #             return {
-        #                 "markdown": "",
-        #                 "cleaned_html": "",
-        #                 "success": True,
-        #                 "media": {"images": [], "videos": [], "audios": []},
-        #                 "links": {"internal": [], "external": []},
-        #                 "metadata": meta,
-        #                 "message": f"No elements found for CSS selector: {css_selector}",
-        #             }
-        #         body = lhtml.Element("div")
-        #         body.extend(selected_elements)
-        #     except Exception as e:
-        #         self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
-        #         return None
         content_element = None
         if target_elements:
             try:
-                for_content_targeted_element = []
-                for target_element in target_elements:
-                    for_content_targeted_element.extend(body.cssselect(target_element))
-                content_element = lhtml.Element("div")
-                content_element.extend(for_content_targeted_element)
+                for target_element in target_elements:
+                    # Creating a fresh parse of HTML for each selector to prevent element extraction
+                    # from modifying the original DOM tree; this keeps the original body
+                    # intact for link processing. This is better performant than deepcopy.
+                    fresh_body = lhtml.document_fromstring(html)
+                    for_content_targeted_element = []
+                    for target_element in target_elements:
+                        for_content_targeted_element.extend(fresh_body.cssselect(target_element))
+                    content_element = lhtml.Element("div")
+                    content_element.extend(for_content_targeted_element)
             except Exception as e:
                 self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
                 return None
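The LXML branch needs the same treatment because lxml elements have exactly one parent: extending a new Element with nodes selected from body moves them out of the original tree. A minimal sketch of the library behaviour, illustrative only; .cssselect() requires the cssselect package:

    from lxml import html as lhtml

    doc = '<body><div id="content"><a href="/in">in</a></div><a href="/out">out</a></body>'

    # extend() relocates the selected nodes, so the source tree loses them.
    body = lhtml.document_fromstring(doc)
    container = lhtml.Element("div")
    container.extend(body.cssselect("#content"))
    print(len(body.cssselect("a")))   # 1: the /in link is no longer under body

    # The commit's approach: pull targets from a fresh parse so body keeps every link.
    body = lhtml.document_fromstring(doc)
    fresh = lhtml.document_fromstring(doc)
    container = lhtml.Element("div")
    container.extend(fresh.cssselect("#content"))
    print(len(body.cssselect("a")))   # 2: link extraction over body is unaffected

Re-parsing the raw HTML once per selector is the trade-off the in-code comment refers to: it spends extra parse time but avoids deep-copying an already-built tree.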