feat: 🚀 Introduce revolutionary LLMTableExtraction with intelligent chunking for massive tables

BREAKING CHANGE: Table extraction now uses Strategy Design Pattern.

This epic commit introduces a game-changing approach to table extraction in Crawl4AI:

✨ NEW FEATURES:
- LLMTableExtraction: AI-powered extraction for complex HTML tables with rowspan/colspan
- Smart Chunking: Automatically splits massive tables into optimal chunks at row boundaries
- Parallel Processing: Processes multiple chunks simultaneously for blazing-fast extraction
- Intelligent Merging: Seamlessly combines chunk results into complete tables
- Header Preservation: Each chunk maintains context with original headers
- Auto-retry Logic: Built-in resilience with configurable retry attempts

🏗️ ARCHITECTURE:
- Strategy Design Pattern for pluggable table extraction strategies
- ThreadPoolExecutor for concurrent chunk processing
- Token-based chunking with configurable thresholds
- Handles tables without headers gracefully

⚡ PERFORMANCE:
- Process 1000+ row tables without timeout
- Parallel processing with up to 5 concurrent chunks
- Smart token estimation prevents LLM context overflow
- Optimized for providers like Groq for massive tables

🔧 CONFIGURATION:
- enable_chunking: Auto-handle large tables (default: True)
- chunk_token_threshold: When to split (default: 3000 tokens)
- min_rows_per_chunk: Meaningful chunk sizes (default: 10)
- max_parallel_chunks: Concurrent processing (default: 5)

📚 BACKWARD COMPATIBILITY:
- Existing code continues to work unchanged
- DefaultTableExtraction remains the default strategy
- Progressive enhancement approach

This is the future of web table extraction - handling everything from simple tables to massive, complex data grids with merged cells and nested structures. The chunking is completely transparent to users while providing unprecedented scalability.
2025-08-14 18:21:24 +08:00
parent 926e41aab8
commit a51545c883
9 changed files with 3536 additions and 116 deletions
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -29,6 +29,12 @@ from .extraction_strategy import (
 )
 from .chunking_strategy import ChunkingStrategy, RegexChunking
 from .markdown_generation_strategy import DefaultMarkdownGenerator
+from .table_extraction import (
+    TableExtractionStrategy,
+    DefaultTableExtraction,
+    NoTableExtraction,
+    LLMTableExtraction,
+)
 from .content_filter_strategy import (
    PruningContentFilter,
    BM25ContentFilter,
@@ -156,6 +162,9 @@ __all__ = [
    "ChunkingStrategy",
    "RegexChunking",
    "DefaultMarkdownGenerator",
+    "TableExtractionStrategy",
+    "DefaultTableExtraction",
+    "NoTableExtraction",
    "RelevantContentFilter",
    "PruningContentFilter",
    "BM25ContentFilter",
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -20,6 +20,7 @@ from .chunking_strategy import ChunkingStrategy, RegexChunking
 from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
 from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy
 from .deep_crawling import DeepCrawlStrategy
+from .table_extraction import TableExtractionStrategy, DefaultTableExtraction

 from .cache_context import CacheMode
 from .proxy_strategy import ProxyRotationStrategy
@@ -982,6 +983,8 @@ class CrawlerRunConfig():
                                         Default: False.
        table_score_threshold (int): Minimum score threshold for processing a table.
                                     Default: 7.
+        table_extraction (TableExtractionStrategy): Strategy to use for table extraction.
+                                     Default: DefaultTableExtraction with table_score_threshold.

        # Virtual Scroll Parameters
        virtual_scroll_config (VirtualScrollConfig or dict or None): Configuration for handling virtual scroll containers.
@@ -1108,6 +1111,7 @@ class CrawlerRunConfig():
        image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
        image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
        table_score_threshold: int = 7,
+        table_extraction: TableExtractionStrategy = None,
        exclude_external_images: bool = False,
        exclude_all_images: bool = False,
        # Link and Domain Handling Parameters
@@ -1224,6 +1228,12 @@ class CrawlerRunConfig():
        self.exclude_external_images = exclude_external_images
        self.exclude_all_images = exclude_all_images
        self.table_score_threshold = table_score_threshold
+        
+        # Table extraction strategy (default to DefaultTableExtraction if not specified)
+        if table_extraction is None:
+            self.table_extraction = DefaultTableExtraction(table_score_threshold=table_score_threshold)
+        else:
+            self.table_extraction = table_extraction

        # Link and Domain Handling Parameters
        self.exclude_social_media_domains = (
@@ -1495,6 +1505,7 @@ class CrawlerRunConfig():
                "image_score_threshold", IMAGE_SCORE_THRESHOLD
            ),
            table_score_threshold=kwargs.get("table_score_threshold", 7),
+            table_extraction=kwargs.get("table_extraction", None),
            exclude_all_images=kwargs.get("exclude_all_images", False),
            exclude_external_images=kwargs.get("exclude_external_images", False),
            # Link and Domain Handling Parameters
@@ -1603,6 +1614,7 @@ class CrawlerRunConfig():
            "image_description_min_word_threshold": self.image_description_min_word_threshold,
            "image_score_threshold": self.image_score_threshold,
            "table_score_threshold": self.table_score_threshold,
+            "table_extraction": self.table_extraction,
            "exclude_all_images": self.exclude_all_images,
            "exclude_external_images": self.exclude_external_images,
            "exclude_social_media_domains": self.exclude_social_media_domains,
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -586,117 +586,6 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):

        return root

-    def is_data_table(self, table: etree.Element, **kwargs) -> bool:
-        score = 0
-        # Check for thead and tbody
-        has_thead = len(table.xpath(".//thead")) > 0
-        has_tbody = len(table.xpath(".//tbody")) > 0
-        if has_thead:
-            score += 2
-        if has_tbody:
-            score += 1
-
-        # Check for th elements
-        th_count = len(table.xpath(".//th"))
-        if th_count > 0:
-            score += 2
-            if has_thead or table.xpath(".//tr[1]/th"):
-                score += 1
-
-        # Check for nested tables
-        if len(table.xpath(".//table")) > 0:
-            score -= 3
-
-        # Role attribute check
-        role = table.get("role", "").lower()
-        if role in {"presentation", "none"}:
-            score -= 3
-
-        # Column consistency
-        rows = table.xpath(".//tr")
-        if not rows:
-            return False
-        col_counts = [len(row.xpath(".//td|.//th")) for row in rows]
-        avg_cols = sum(col_counts) / len(col_counts)
-        variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts)
-        if variance < 1:
-            score += 2
-
-        # Caption and summary
-        if table.xpath(".//caption"):
-            score += 2
-        if table.get("summary"):
-            score += 1
-
-        # Text density
-        total_text = sum(len(''.join(cell.itertext()).strip()) for row in rows for cell in row.xpath(".//td|.//th"))
-        total_tags = sum(1 for _ in table.iterdescendants())
-        text_ratio = total_text / (total_tags + 1e-5)
-        if text_ratio > 20:
-            score += 3
-        elif text_ratio > 10:
-            score += 2
-
-        # Data attributes
-        data_attrs = sum(1 for attr in table.attrib if attr.startswith('data-'))
-        score += data_attrs * 0.5
-
-        # Size check
-        if avg_cols >= 2 and len(rows) >= 2:
-            score += 2
-
-        threshold = kwargs.get("table_score_threshold", 7)
-        return score >= threshold
-
-    def extract_table_data(self, table: etree.Element) -> dict:
-        caption = table.xpath(".//caption/text()")
-        caption = caption[0].strip() if caption else ""
-        summary = table.get("summary", "").strip()
-
-        # Extract headers with colspan handling
-        headers = []
-        thead_rows = table.xpath(".//thead/tr")
-        if thead_rows:
-            header_cells = thead_rows[0].xpath(".//th")
-            for cell in header_cells:
-                text = cell.text_content().strip()
-                colspan = int(cell.get("colspan", 1))
-                headers.extend([text] * colspan)
-        else:
-            first_row = table.xpath(".//tr[1]")
-            if first_row:
-                for cell in first_row[0].xpath(".//th|.//td"):
-                    text = cell.text_content().strip()
-                    colspan = int(cell.get("colspan", 1))
-                    headers.extend([text] * colspan)
-
-        # Extract rows with colspan handling
-        rows = []
-        for row in table.xpath(".//tr[not(ancestor::thead)]"):
-            row_data = []
-            for cell in row.xpath(".//td"):
-                text = cell.text_content().strip()
-                colspan = int(cell.get("colspan", 1))
-                row_data.extend([text] * colspan)
-            if row_data:
-                rows.append(row_data)
-
-        # Align rows with headers
-        max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
-        aligned_rows = []
-        for row in rows:
-            aligned = row[:max_columns] + [''] * (max_columns - len(row))
-            aligned_rows.append(aligned)
-
-        if not headers:
-            headers = [f"Column {i+1}" for i in range(max_columns)]
-
-        return {
-            "headers": headers,
-            "rows": aligned_rows,
-            "caption": caption,
-            "summary": summary,
-        }

    def _scrap(
        self,
@@ -839,12 +728,16 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
                **kwargs,
            )

+            # Extract tables using the table extraction strategy if provided
            if 'table' not in excluded_tags:
-                tables = body.xpath(".//table")
-                for table in tables:
-                    if self.is_data_table(table, **kwargs):
-                        table_data = self.extract_table_data(table)
-                        media["tables"].append(table_data)
+                table_extraction = kwargs.get('table_extraction')
+                if table_extraction:
+                    # Pass logger to the strategy if it doesn't have one
+                    if not table_extraction.logger:
+                        table_extraction.logger = self.logger
+                    # Extract tables using the strategy
+                    extracted_tables = table_extraction.extract_tables(body, **kwargs)
+                    media["tables"].extend(extracted_tables)

            # Handle only_text option
            if kwargs.get("only_text", False):
--- a/crawl4ai/table_extraction.py
+++ b/crawl4ai/table_extraction.py