From 7d8e81fb2e04b4c0844b37491664b05f65441567 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 12 Apr 2025 12:44:00 +0530 Subject: [PATCH] fix: fix target_elements, in a less invasive and more efficient way simply by changing order of execution :) https://github.com/unclecode/crawl4ai/issues/902 --- crawl4ai/content_scraping_strategy.py | 58 +++++++++++++-------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 814e4b2b..aa69c5fb 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -901,22 +901,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): element.extract() else: for element in body.select(excluded_selector): - element.extract() - - content_element = None - if target_elements: - try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.select(target_element)) - content_element = soup.new_tag("div") - for el in for_content_targeted_element: - content_element.append(el) - except Exception as e: - self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") - return None - else: - content_element = body + element.extract() kwargs["exclude_social_media_domains"] = set( kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS @@ -976,6 +961,20 @@ class WebScrapingStrategy(ContentScrapingStrategy): str_body = "" try: + content_element = None + if target_elements: + try: + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.select(target_element)) + content_element = soup.new_tag("div") + for el in for_content_targeted_element: + content_element.append(el) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body str_body = content_element.encode_contents().decode("utf-8") except Exception: # Reset body to the original HTML @@ -1532,20 +1531,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE") meta = {} - content_element = None - if target_elements: - try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.cssselect(target_element)) - content_element = lhtml.Element("div") - content_element.extend(for_content_targeted_element) - except Exception as e: - self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") - return None - else: - content_element = body - # Remove script and style tags for tag in ["script", "style", "link", "meta", "noscript"]: for element in body.xpath(f".//{tag}"): @@ -1614,6 +1599,19 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): ) # Generate output HTML + content_element = None + if target_elements: + try: + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.cssselect(target_element)) + content_element = lhtml.Element("div") + content_element.extend(for_content_targeted_element) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body cleaned_html = lhtml.tostring( # body, content_element,