From c2902fd200fa5ad354da33d8528a12844b3c75be Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 19 Apr 2025 19:46:20 +0530 Subject: [PATCH] revert: last change in order of execution, as it introduced a new issue in the generated content. https://github.com/unclecode/crawl4ai/issues/902 --- crawl4ai/content_scraping_strategy.py | 58 ++++++++++++++------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index aa69c5fb..814e4b2b 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -901,7 +901,22 @@ class WebScrapingStrategy(ContentScrapingStrategy): element.extract() else: for element in body.select(excluded_selector): - element.extract() + element.extract() + + content_element = None + if target_elements: + try: + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.select(target_element)) + content_element = soup.new_tag("div") + for el in for_content_targeted_element: + content_element.append(el) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body kwargs["exclude_social_media_domains"] = set( kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS @@ -961,20 +976,6 @@ class WebScrapingStrategy(ContentScrapingStrategy): str_body = "" try: - content_element = None - if target_elements: - try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.select(target_element)) - content_element = soup.new_tag("div") - for el in for_content_targeted_element: - content_element.append(el) - except Exception as e: - self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") - return None - else: - content_element = body str_body = 
content_element.encode_contents().decode("utf-8") except Exception: # Reset body to the original HTML @@ -1531,6 +1532,20 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE") meta = {} + content_element = None + if target_elements: + try: + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.cssselect(target_element)) + content_element = lhtml.Element("div") + content_element.extend(for_content_targeted_element) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body + # Remove script and style tags for tag in ["script", "style", "link", "meta", "noscript"]: for element in body.xpath(f".//{tag}"): @@ -1599,19 +1614,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): ) # Generate output HTML - content_element = None - if target_elements: - try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.cssselect(target_element)) - content_element = lhtml.Element("div") - content_element.extend(for_content_targeted_element) - except Exception as e: - self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") - return None - else: - content_element = body cleaned_html = lhtml.tostring( # body, content_element,