fix:target_element should not affect link extraction. -> https://github.com/unclecode/crawl4ai/issues/902
This commit is contained in:
@@ -897,29 +897,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
for element in body.select(excluded_selector):
|
for element in body.select(excluded_selector):
|
||||||
element.extract()
|
element.extract()
|
||||||
|
|
||||||
# if False and css_selector:
|
|
||||||
# selected_elements = body.select(css_selector)
|
|
||||||
# if not selected_elements:
|
|
||||||
# return {
|
|
||||||
# "markdown": "",
|
|
||||||
# "cleaned_html": "",
|
|
||||||
# "success": True,
|
|
||||||
# "media": {"images": [], "videos": [], "audios": []},
|
|
||||||
# "links": {"internal": [], "external": []},
|
|
||||||
# "metadata": {},
|
|
||||||
# "message": f"No elements found for CSS selector: {css_selector}",
|
|
||||||
# }
|
|
||||||
# # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
|
|
||||||
# body = soup.new_tag("div")
|
|
||||||
# for el in selected_elements:
|
|
||||||
# body.append(el)
|
|
||||||
|
|
||||||
content_element = None
|
content_element = None
|
||||||
if target_elements:
|
if target_elements:
|
||||||
try:
|
try:
|
||||||
for_content_targeted_element = []
|
for_content_targeted_element = []
|
||||||
for target_element in target_elements:
|
for target_element in target_elements:
|
||||||
for_content_targeted_element.extend(body.select(target_element))
|
# Creating a fresh parse of HTML for each selector to prevent element extraction
|
||||||
|
# from modifying the original DOM tree; this keeps the original body
|
||||||
|
# intact for link processing. This is better performant than deepcopy.
|
||||||
|
fresh_body = BeautifulSoup(html, "html.parser")
|
||||||
|
for_content_targeted_element.extend(fresh_body.select(target_element))
|
||||||
content_element = soup.new_tag("div")
|
content_element = soup.new_tag("div")
|
||||||
for el in for_content_targeted_element:
|
for el in for_content_targeted_element:
|
||||||
content_element.append(el)
|
content_element.append(el)
|
||||||
@@ -1536,34 +1523,20 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
|
self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
|
||||||
meta = {}
|
meta = {}
|
||||||
|
|
||||||
# Handle CSS selector targeting
|
|
||||||
# if css_selector:
|
|
||||||
# try:
|
|
||||||
# selected_elements = body.cssselect(css_selector)
|
|
||||||
# if not selected_elements:
|
|
||||||
# return {
|
|
||||||
# "markdown": "",
|
|
||||||
# "cleaned_html": "",
|
|
||||||
# "success": True,
|
|
||||||
# "media": {"images": [], "videos": [], "audios": []},
|
|
||||||
# "links": {"internal": [], "external": []},
|
|
||||||
# "metadata": meta,
|
|
||||||
# "message": f"No elements found for CSS selector: {css_selector}",
|
|
||||||
# }
|
|
||||||
# body = lhtml.Element("div")
|
|
||||||
# body.extend(selected_elements)
|
|
||||||
# except Exception as e:
|
|
||||||
# self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
|
|
||||||
# return None
|
|
||||||
|
|
||||||
content_element = None
|
content_element = None
|
||||||
if target_elements:
|
if target_elements:
|
||||||
try:
|
try:
|
||||||
for_content_targeted_element = []
|
|
||||||
for target_element in target_elements:
|
|
||||||
for_content_targeted_element.extend(body.cssselect(target_element))
|
|
||||||
content_element = lhtml.Element("div")
|
content_element = lhtml.Element("div")
|
||||||
content_element.extend(for_content_targeted_element)
|
for target_element in target_elements:
|
||||||
|
# Creating a fresh parse of HTML for each selector to prevent element extraction
|
||||||
|
# from modifying the original DOM tree; this keeps the original body
|
||||||
|
# intact for link processing. This is better performant than deepcopy.
|
||||||
|
fresh_body = lhtml.document_fromstring(html)
|
||||||
|
for_content_targeted_element = []
|
||||||
|
for target_element in target_elements:
|
||||||
|
for_content_targeted_element.extend(fresh_body.cssselect(target_element))
|
||||||
|
content_element = lhtml.Element("div")
|
||||||
|
content_element.extend(for_content_targeted_element)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||||
return None
|
return None
|
||||||
|
|||||||
Reference in New Issue
Block a user