feat(scraping): add smart table extraction and analysis capabilities

Add comprehensive table detection and extraction functionality to the web scraping system:

- Implement an intelligent table detection algorithm with a scoring system
- Add table extraction with support for headers, rows, and captions
- Update models to include tables in the Media class
- Add the table_score_threshold configuration option
- Add documentation and examples for table extraction
- Include a crypto analysis example demonstrating table usage

This change enables users to extract structured data from HTML tables while intelligently filtering out layout tables.
2025-03-09 21:31:33 +08:00
parent c6a605ccce
commit 9d69fce834
5 changed files with 586 additions and 9 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -597,6 +597,8 @@ class CrawlerRunConfig():
                                     Default: IMAGE_SCORE_THRESHOLD (e.g., 3).
        exclude_external_images (bool): If True, exclude all external images from processing.
                                         Default: False.
+        table_score_threshold (int): Minimum score threshold for processing a table.
+                                     Default: 7.

        # Link and Domain Handling Parameters
        exclude_social_media_domains (list of str): List of domains to exclude for social media links.
@@ -698,6 +700,7 @@ class CrawlerRunConfig():
        pdf: bool = False,
        image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
        image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
+        table_score_threshold: int = 7,
        exclude_external_images: bool = False,
        # Link and Domain Handling Parameters
        exclude_social_media_domains: list = None,
@@ -783,6 +786,7 @@ class CrawlerRunConfig():
        self.image_description_min_word_threshold = image_description_min_word_threshold
        self.image_score_threshold = image_score_threshold
        self.exclude_external_images = exclude_external_images
+        self.table_score_threshold = table_score_threshold

        # Link and Domain Handling Parameters
        self.exclude_social_media_domains = (
@@ -913,6 +917,7 @@ class CrawlerRunConfig():
            image_score_threshold=kwargs.get(
                "image_score_threshold", IMAGE_SCORE_THRESHOLD
            ),
+            table_score_threshold=kwargs.get("table_score_threshold", 7),
            exclude_external_images=kwargs.get("exclude_external_images", False),
            # Link and Domain Handling Parameters
            exclude_social_media_domains=kwargs.get(
@@ -1001,6 +1006,7 @@ class CrawlerRunConfig():
            "pdf": self.pdf,
            "image_description_min_word_threshold": self.image_description_min_word_threshold,
            "image_score_threshold": self.image_score_threshold,
+            "table_score_threshold": self.table_score_threshold,
            "exclude_external_images": self.exclude_external_images,
            "exclude_social_media_domains": self.exclude_social_media_domains,
            "exclude_external_links": self.exclude_external_links,
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -155,6 +155,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                for aud in raw_result.get("media", {}).get("audios", [])
                if aud
            ],
+            tables=raw_result.get("media", {}).get("tables", [])
        )

        # Convert links
@@ -193,6 +194,139 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        """
        return await asyncio.to_thread(self._scrap, url, html, **kwargs)

+    def is_data_table(self, table: Tag, **kwargs) -> bool:
+        """
+        Decide whether a ``<table>`` holds tabular data rather than page layout.
+
+        A heuristic score built from structural hints is compared to a threshold.
+
+        Args:
+            table (Tag): BeautifulSoup Tag for the table element to inspect
+            **kwargs: Optional settings; ``table_score_threshold`` (default 7)
+                is the minimum score needed to count as a data table
+
+        Returns:
+            bool: True if the score reaches the threshold (False with no rows)
+        """
+        points = 0
+
+        # Structural sections: <thead> is worth more than <tbody>
+        thead_present = bool(table.select('thead'))
+        if thead_present:
+            points += 2
+        if table.select('tbody'):
+            points += 1
+
+        # Header cells, with a bonus when they sit in a header position
+        if table.select('th'):
+            points += 2
+            if thead_present or table.select('tr:first-child th'):
+                points += 1
+
+        # Penalties: nested tables and presentational ARIA roles
+        if table.select('table'):
+            points -= 3
+        if table.get('role', '').lower() in {'presentation', 'none'}:
+            points -= 3
+
+        # Nothing to score without rows
+        all_rows = table.select('tr')
+        if not all_rows:
+            return False
+
+        # One cell list per row, reused for width and text measurements
+        cells_by_row = [tr.select('td, th') for tr in all_rows]
+        widths = [len(cells) for cells in cells_by_row]
+        row_total = len(widths)
+        mean_width = sum(widths) / row_total
+
+        # Column consistency
+        if sum((w - mean_width)**2 for w in widths) / row_total < 1:
+            points += 2
+
+        # Caption and summary
+        if table.select('caption'):
+            points += 2
+        if table.has_attr('summary') and table['summary']:
+            points += 1
+
+        # Text density: characters of cell text per descendant tag
+        char_total = sum(len(c.get_text().strip()) for cells in cells_by_row for c in cells)
+        tag_total = sum(1 for node in table.descendants if isinstance(node, Tag))
+        density = char_total / (tag_total + 1e-5)
+        if density > 20:
+            points += 3
+        elif density > 10:
+            points += 2
+
+        # Each data-* attribute on the table adds half a point
+        points += 0.5 * sum(1 for name in table.attrs if name.startswith('data-'))
+
+        # Size check
+        if mean_width >= 2 and row_total >= 2:
+            points += 2
+
+        return points >= kwargs.get('table_score_threshold', 7)
+    
+    def extract_table_data(self, table: Tag) -> dict:
+        """
+        Extract structured data from a table element.
+
+        Headers come from the first ``<thead>`` row, else the table's first row.
+
+        Args:
+            table (Tag): BeautifulSoup Tag representing a table element
+
+        Returns:
+            dict: ``headers`` (list of str), ``rows`` (list of lists padded or
+                truncated to the header width), ``caption`` and ``summary``
+        """
+        def span(cell: Tag) -> int:
+            # Malformed colspan ("", "2px", "0") must not raise or drop the cell; cap at 1000.
+            try:
+                return min(max(int(cell.get('colspan', 1)), 1), 1000)
+            except (TypeError, ValueError):
+                return 1
+
+        def expand(cells) -> list:
+            # One text entry per spanned column.
+            return [cell.get_text().strip() for cell in cells for _ in range(span(cell))]
+
+        caption_elem = table.select_one('caption')
+        caption = caption_elem.get_text().strip() if caption_elem is not None else ""
+        summary = table.get('summary', '').strip()
+
+        thead_rows = table.select('thead tr')
+        if thead_rows:
+            headers = expand(thead_rows[0].select('th'))
+        else:
+            first_row = table.select_one('tr:first-child')
+            headers = expand(first_row.select('th, td')) if first_row is not None else []
+
+        # Body rows are every <tr> outside <thead>. 'tr:not(:has(ancestor::thead))'
+        # mixed an XPath axis into CSS, which is not a valid selector; filter here.
+        rows = []
+        for row in table.select('tr'):
+            if row.find_parent('thead') is not None:
+                continue
+            row_data = expand(row.select('td'))
+            if row_data:  # rows made only of <th> cells carry no data
+                rows.append(row_data)
+
+        # Align rows with headers
+        max_columns = len(headers) if headers else max((len(r) for r in rows), default=0)
+        aligned_rows = [r[:max_columns] + [''] * (max_columns - len(r)) for r in rows]
+
+        if not headers:
+            headers = [f"Column {i+1}" for i in range(max_columns)]
+
+        return {
+            "headers": headers,
+            "rows": aligned_rows,
+            "caption": caption,
+            "summary": summary,
+        }
+    
    def flatten_nested_elements(self, node):
        """
        Flatten nested elements in a HTML tree.
@@ -431,7 +565,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        Returns:
            dict: A dictionary containing the processed element information.
        """
-        media = {"images": [], "videos": [], "audios": []}
+        media = {"images": [], "videos": [], "audios": [], "tables": []}
        internal_links_dict = {}
        external_links_dict = {}
        self._process_element(
@@ -797,6 +931,15 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            if result is not None
            for img in result
        ]
+        
+        # Process tables if not excluded
+        excluded_tags = set(kwargs.get("excluded_tags", []) or [])
+        if 'table' not in excluded_tags:
+            tables = body.find_all('table')
+            for table in tables:
+                if self.is_data_table(table, **kwargs):
+                    table_data = self.extract_table_data(table)
+                    media["tables"].append(table_data)

        body = self.flatten_nested_elements(body)
        base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
@@ -847,8 +990,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        cleaned_html = str_body.replace("\n\n", "\n").replace("  ", " ")

        return {
-            # **markdown_content,
-            # "scraped_html": html,
            "cleaned_html": cleaned_html,
            "success": success,
            "media": media,
@@ -1188,6 +1329,118 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):

        return root

+    def is_data_table(self, table: etree.Element, **kwargs) -> bool:
+        """
+        Score an lxml ``<table>`` element and report whether it is a data table.
+
+        Args:
+            table (etree.Element): The table element to inspect
+            **kwargs: May carry ``table_score_threshold`` (minimum score, default 7)
+
+        Returns:
+            bool: True if the score reaches the threshold (False with no rows)
+        """
+        tally = 0
+        # Structural sections: <thead> outweighs <tbody>
+        thead_found = bool(table.xpath(".//thead"))
+        if thead_found:
+            tally += 2
+        if table.xpath(".//tbody"):
+            tally += 1
+
+        # Header cells, plus a bonus when they sit in a header position
+        if table.xpath(".//th"):
+            tally += 2
+            if thead_found or table.xpath(".//tr[1]/th"):
+                tally += 1
+
+        # Penalties: nested tables and presentational ARIA roles
+        if table.xpath(".//table"):
+            tally -= 3
+        if table.get("role", "").lower() in {"presentation", "none"}:
+            tally -= 3
+
+        tr_nodes = table.xpath(".//tr")
+        if not tr_nodes:
+            return False
+
+        # Column consistency: population variance of per-row cell counts
+        cells_by_row = [tr.xpath(".//td|.//th") for tr in tr_nodes]
+        widths = [len(cells) for cells in cells_by_row]
+        mean_width = sum(widths) / len(widths)
+        if sum((w - mean_width)**2 for w in widths) / len(widths) < 1:
+            tally += 2
+
+        # Caption and summary
+        if table.xpath(".//caption"):
+            tally += 2
+        if table.get("summary"):
+            tally += 1
+
+        # Text density: characters of cell text per descendant node
+        char_total = sum(len(''.join(c.itertext()).strip()) for cells in cells_by_row for c in cells)
+        density = char_total / (sum(1 for _ in table.iterdescendants()) + 1e-5)
+        if density > 20:
+            tally += 3
+        elif density > 10:
+            tally += 2
+
+        # Each data-* attribute adds half a point; then the size check
+        tally += 0.5 * sum(1 for name in table.attrib if name.startswith('data-'))
+        if mean_width >= 2 and len(tr_nodes) >= 2:
+            tally += 2
+        return tally >= kwargs.get("table_score_threshold", 7)
+
+    def extract_table_data(self, table: etree.Element) -> dict:
+        """
+        Extract headers, rows, caption and summary from an lxml table element.
+
+        Args:
+            table (etree.Element): The table element to read
+
+        Returns:
+            dict: ``headers``, ``rows`` (aligned to headers), ``caption``, ``summary``
+        """
+        def span(cell) -> int:
+            # Malformed colspan ("", "2px", "0") must not raise or drop the cell; cap at 1000.
+            try:
+                return min(max(int(cell.get("colspan", 1)), 1), 1000)
+            except (TypeError, ValueError):
+                return 1
+
+        def expand(cells) -> list:
+            # One text entry per spanned column.
+            return [cell.text_content().strip() for cell in cells for _ in range(span(cell))]
+
+        # Use the caption's full text so markup such as <caption><b>..</b></caption>
+        # is not lost (".//caption/text()" only saw the first direct text node).
+        caption_nodes = table.xpath(".//caption")
+        caption = caption_nodes[0].text_content().strip() if caption_nodes else ""
+        summary = table.get("summary", "").strip()
+
+        thead_rows = table.xpath(".//thead/tr")
+        if thead_rows:
+            headers = expand(thead_rows[0].xpath(".//th"))
+        else:
+            first_row = table.xpath(".//tr[1]")
+            headers = expand(first_row[0].xpath(".//th|.//td")) if first_row else []
+
+        # Body rows: every <tr> outside <thead>; rows without <td> are skipped
+        body_rows = table.xpath(".//tr[not(ancestor::thead)]")
+        rows = [r for r in (expand(tr.xpath(".//td")) for tr in body_rows) if r]
+
+        max_columns = len(headers) if headers else max((len(r) for r in rows), default=0)
+        aligned_rows = [r[:max_columns] + [''] * (max_columns - len(r)) for r in rows]
+        if not headers:
+            headers = [f"Column {i+1}" for i in range(max_columns)]
+
+        return {
+            "headers": headers,
+            "rows": aligned_rows,
+            "caption": caption,
+            "summary": summary,
+        }
+
    def _scrap(
        self,
        url: str,
@@ -1285,7 +1538,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
                        form.getparent().remove(form)

            # Process content
-            media = {"images": [], "videos": [], "audios": []}
+            media = {"images": [], "videos": [], "audios": [], "tables": []}
            internal_links_dict = {}
            external_links_dict = {}

@@ -1299,6 +1552,13 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
                **kwargs,
            )

+            if 'table' not in excluded_tags:
+                tables = body.xpath(".//table")
+                for table in tables:
+                    if self.is_data_table(table, **kwargs):
+                        table_data = self.extract_table_data(table)
+                        media["tables"].append(table_data)
+
            # Handle only_text option
            if kwargs.get("only_text", False):
                for tag in ONLY_TEXT_ELIGIBLE_TAGS:
@@ -1370,7 +1630,12 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            return {
                "cleaned_html": cleaned_html,
                "success": False,
-                "media": {"images": [], "videos": [], "audios": []},
+                "media": {
+                    "images": [],
+                    "videos": [],
+                    "audios": [],
+                    "tables": []
+                },
                "links": {"internal": [], "external": []},
                "metadata": {},
            }
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -326,6 +326,7 @@ class Media(BaseModel):
    audios: List[
        MediaItem
    ] = []  # Using MediaItem model for now, can be extended with Audio model if needed
+    tables: List[Dict] = []  # Table data extracted from HTML tables


 class Links(BaseModel):