From d84508b4d5dad7c3b8f9b772cedfdc08c89ab2a9 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 12 Apr 2025 12:05:17 +0530 Subject: [PATCH] fix: revert the old target_elms code in regular webscraping strategy --- crawl4ai/content_scraping_strategy.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 81fe9d4e..0a93352b 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -908,11 +908,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): try: for_content_targeted_element = [] for target_element in target_elements: - # Creating a fresh parse of HTML for each selector to prevent element extraction - # from modifying the original DOM tree; this keeps the original body - # intact for link processing. This is better performant than deepcopy. - fresh_body = BeautifulSoup(html, "lxml") - for_content_targeted_element.extend(fresh_body.select(target_element)) + for_content_targeted_element.extend(body.select(target_element)) content_element = soup.new_tag("div") for el in for_content_targeted_element: content_element.append(el) @@ -920,7 +916,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None else: - content_element = body + content_element = body kwargs["exclude_social_media_domains"] = set( kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS