feat(content): add target_elements parameter for selective content extraction

Adds a new target_elements parameter to CrawlerRunConfig that allows more flexible content selection than css_selector. It focuses markdown generation and data extraction on specific elements while the entire page is still processed for links and media.

Key changes:
- Added target_elements list parameter to CrawlerRunConfig
- Modified WebScrapingStrategy and LXMLWebScrapingStrategy to handle target_elements
- Updated documentation with examples and a comparison between css_selector and target_elements
- Fixed table extraction in content_scraping_strategy.py

BREAKING CHANGE: Table extraction logic has been modified to better handle thead/tbody structures
2025-03-10 18:54:51 +08:00
parent 9d69fce834
commit 9547bada3a
7 changed files with 188 additions and 47 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -11,7 +11,7 @@ from .config import (
 )

 from .user_agent_generator import UAGen, ValidUAGenerator  # , OnlineUAGenerator
-from .extraction_strategy import ExtractionStrategy
+from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
 from .chunking_strategy import ChunkingStrategy, RegexChunking

 from .markdown_generation_strategy import MarkdownGenerationStrategy
@@ -501,6 +501,15 @@ class CrawlerRunConfig():
                          Default: False.
        css_selector (str or None): CSS selector to extract a specific portion of the page.
                                    Default: None.
+        
+        target_elements (list of str or None): List of CSS selectors for specific elements for Markdown generation 
+                                                and structured data extraction. When you set this, only the contents 
+                                                of these elements are processed for extraction and Markdown generation. 
+                                                If you do not set any value, the entire page is processed. 
+                                                The difference between this and css_selector is that css_selector will shrink
+                                                the initial raw HTML to the selected element, while target_elements will only affect
+                                                the extraction and Markdown generation.
+                                    Default: None
        excluded_tags (list of str or None): List of HTML tags to exclude from processing.
                                             Default: None.
        excluded_selector (str or None): CSS selector to exclude from processing.
@@ -652,6 +661,7 @@ class CrawlerRunConfig():
        markdown_generator: MarkdownGenerationStrategy = None,
        only_text: bool = False,
        css_selector: str = None,
+        target_elements: List[str] = None,
        excluded_tags: list = None,
        excluded_selector: str = None,
        keep_data_attributes: bool = False,
@@ -732,6 +742,7 @@ class CrawlerRunConfig():
        self.markdown_generator = markdown_generator
        self.only_text = only_text
        self.css_selector = css_selector
+        self.target_elements = target_elements or []
        self.excluded_tags = excluded_tags or []
        self.excluded_selector = excluded_selector or ""
        self.keep_data_attributes = keep_data_attributes
@@ -862,6 +873,7 @@ class CrawlerRunConfig():
            markdown_generator=kwargs.get("markdown_generator"),
            only_text=kwargs.get("only_text", False),
            css_selector=kwargs.get("css_selector"),
+            target_elements=kwargs.get("target_elements", []),
            excluded_tags=kwargs.get("excluded_tags", []),
            excluded_selector=kwargs.get("excluded_selector", ""),
            keep_data_attributes=kwargs.get("keep_data_attributes", False),
@@ -963,6 +975,7 @@ class CrawlerRunConfig():
            "markdown_generator": self.markdown_generator,
            "only_text": self.only_text,
            "css_selector": self.css_selector,
+            "target_elements": self.target_elements,
            "excluded_tags": self.excluded_tags,
            "excluded_selector": self.excluded_selector,
            "keep_data_attributes": self.keep_data_attributes,
@@ -1099,3 +1112,5 @@ class LLMConfig:
        config_dict = self.to_dict()
        config_dict.update(kwargs)
        return LLMConfig.from_kwargs(config_dict)
+
+
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -514,7 +514,8 @@ class AsyncWebCrawler:
                scraping_strategy.logger = self.logger

            # Process HTML content
-            params = {k: v for k, v in config.to_dict().items() if k not in ["url"]}
+            params = config.__dict__.copy()
+            params.pop("url", None)            
            # add keys from kwargs to params that doesn't exist in params
            params.update({k: v for k, v in kwargs.items() if k not in params.keys()})

--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -301,7 +301,21 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        
        # Extract rows with colspan handling
        rows = []
-        for row in table.select('tr:not(:has(ancestor::thead))'):
+        all_rows = table.select('tr')
+        thead = table.select_one('thead')
+        tbody_rows = []
+
+        if thead:
+            thead_rows = thead.select('tr')
+            tbody_rows = [row for row in all_rows if row not in thead_rows]
+        else:
+            if all_rows and all_rows[0].select('th'):
+                tbody_rows = all_rows[1:]
+            else:
+                tbody_rows = all_rows
+                
+        for row in tbody_rows:        
+        # for row in table.select('tr:not(:has(ancestor::thead))'):
            row_data = []
            for cell in row.select('td'):
                text = cell.get_text().strip()
@@ -822,6 +836,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        html: str,
        word_count_threshold: int = MIN_WORD_THRESHOLD,
        css_selector: str = None,
+        target_elements: List[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """
@@ -876,22 +891,37 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                for element in body.select(excluded_selector):
                    element.extract()

-        if False and css_selector:
-            selected_elements = body.select(css_selector)
-            if not selected_elements:
-                return {
-                    "markdown": "",
-                    "cleaned_html": "",
-                    "success": True,
-                    "media": {"images": [], "videos": [], "audios": []},
-                    "links": {"internal": [], "external": []},
-                    "metadata": {},
-                    "message": f"No elements found for CSS selector: {css_selector}",
-                }
-                # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
-            body = soup.new_tag("div")
-            for el in selected_elements:
-                body.append(el)
+        # if False and css_selector:
+        #     selected_elements = body.select(css_selector)
+        #     if not selected_elements:
+        #         return {
+        #             "markdown": "",
+        #             "cleaned_html": "",
+        #             "success": True,
+        #             "media": {"images": [], "videos": [], "audios": []},
+        #             "links": {"internal": [], "external": []},
+        #             "metadata": {},
+        #             "message": f"No elements found for CSS selector: {css_selector}",
+        #         }
+        #         # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
+        #     body = soup.new_tag("div")
+        #     for el in selected_elements:
+        #         body.append(el)
+
+        content_element = None
+        if target_elements:
+            try:
+                for_content_targeted_element = []
+                for target_element in target_elements:
+                    for_content_targeted_element.extend(body.select(target_element))
+                content_element = soup.new_tag("div")
+                for el in for_content_targeted_element:
+                    content_element.append(el)
+            except Exception as e:
+                self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
+                return None
+        else:
+            content_element = body        

        kwargs["exclude_social_media_domains"] = set(
            kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
@@ -951,7 +981,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):

        str_body = ""
        try:
-            str_body = body.encode_contents().decode("utf-8")
+            str_body = content_element.encode_contents().decode("utf-8")
        except Exception:
            # Reset body to the original HTML
            success = False
@@ -1447,6 +1477,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
        html: str,
        word_count_threshold: int = MIN_WORD_THRESHOLD,
        css_selector: str = None,
+        target_elements: List[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        if not html:
@@ -1497,24 +1528,38 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
                meta = {}

            # Handle CSS selector targeting
-            if css_selector:
+            # if css_selector:
+            #     try:
+            #         selected_elements = body.cssselect(css_selector)
+            #         if not selected_elements:
+            #             return {
+            #                 "markdown": "",
+            #                 "cleaned_html": "",
+            #                 "success": True,
+            #                 "media": {"images": [], "videos": [], "audios": []},
+            #                 "links": {"internal": [], "external": []},
+            #                 "metadata": meta,
+            #                 "message": f"No elements found for CSS selector: {css_selector}",
+            #             }
+            #         body = lhtml.Element("div")
+            #         body.extend(selected_elements)
+            #     except Exception as e:
+            #         self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
+            #         return None
+
+            content_element = None
+            if target_elements:
                try:
-                    selected_elements = body.cssselect(css_selector)
-                    if not selected_elements:
-                        return {
-                            "markdown": "",
-                            "cleaned_html": "",
-                            "success": True,
-                            "media": {"images": [], "videos": [], "audios": []},
-                            "links": {"internal": [], "external": []},
-                            "metadata": meta,
-                            "message": f"No elements found for CSS selector: {css_selector}",
-                        }
-                    body = lhtml.Element("div")
-                    body.extend(selected_elements)
+                    for_content_targeted_element = []
+                    for target_element in target_elements:
+                        for_content_targeted_element.extend(body.cssselect(target_element))
+                    content_element = lhtml.Element("div")
+                    content_element.extend(for_content_targeted_element)
                except Exception as e:
-                    self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
+                    self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
                    return None
+            else:
+                content_element = body

            # Remove script and style tags
            for tag in ["script", "style", "link", "meta", "noscript"]:
@@ -1585,7 +1630,8 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):

            # Generate output HTML
            cleaned_html = lhtml.tostring(
-                body,
+                # body,   
+                content_element,
                encoding="unicode",
                pretty_print=True,
                method="html",