From 57e0423b3a6ddb9147fce898a2e5c0afaaead90d Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 28 Mar 2025 12:56:37 +0530 Subject: [PATCH] fix:target_element should not affect link extraction. -> https://github.com/unclecode/crawl4ai/issues/902 --- crawl4ai/content_scraping_strategy.py | 59 ++++++++------------------- 1 file changed, 16 insertions(+), 43 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 0848d655..11835d62 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -897,29 +897,16 @@ class WebScrapingStrategy(ContentScrapingStrategy): for element in body.select(excluded_selector): element.extract() - # if False and css_selector: - # selected_elements = body.select(css_selector) - # if not selected_elements: - # return { - # "markdown": "", - # "cleaned_html": "", - # "success": True, - # "media": {"images": [], "videos": [], "audios": []}, - # "links": {"internal": [], "external": []}, - # "metadata": {}, - # "message": f"No elements found for CSS selector: {css_selector}", - # } - # # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}") - # body = soup.new_tag("div") - # for el in selected_elements: - # body.append(el) - content_element = None if target_elements: try: for_content_targeted_element = [] for target_element in target_elements: - for_content_targeted_element.extend(body.select(target_element)) + # Creating a fresh parse of HTML for each selector to prevent element extraction + # from modifying the original DOM tree; this keeps the original body + # intact for link processing. This is better performant than deepcopy. + fresh_body = BeautifulSoup(html, "html.parser") + for_content_targeted_element.extend(fresh_body.select(target_element)) content_element = soup.new_tag("div") for el in for_content_targeted_element: content_element.append(el) @@ -927,7 +914,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None else: - content_element = body + content_element = body kwargs["exclude_social_media_domains"] = set( kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS @@ -1536,34 +1523,20 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE") meta = {} - # Handle CSS selector targeting - # if css_selector: - # try: - # selected_elements = body.cssselect(css_selector) - # if not selected_elements: - # return { - # "markdown": "", - # "cleaned_html": "", - # "success": True, - # "media": {"images": [], "videos": [], "audios": []}, - # "links": {"internal": [], "external": []}, - # "metadata": meta, - # "message": f"No elements found for CSS selector: {css_selector}", - # } - # body = lhtml.Element("div") - # body.extend(selected_elements) - # except Exception as e: - # self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE") - # return None - content_element = None if target_elements: try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.cssselect(target_element)) content_element = lhtml.Element("div") - content_element.extend(for_content_targeted_element) + for target_element in target_elements: + # Creating a fresh parse of HTML for each selector to prevent element extraction + # from modifying the original DOM tree; this keeps the original body + # intact for link processing. This is better performant than deepcopy. + fresh_body = lhtml.document_fromstring(html) + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(fresh_body.cssselect(target_element)) + content_element = lhtml.Element("div") + content_element.extend(for_content_targeted_element) except Exception as e: self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None