feat: 🚀 Introduce revolutionary LLMTableExtraction with intelligent chunking for massive tables

BREAKING CHANGE: Table extraction now uses the Strategy Design Pattern. The `is_data_table` and `extract_table_data` methods are removed from `LXMLWebScrapingStrategy`; tables are now extracted by the strategy object passed in as `table_extraction`.

This epic commit introduces a game-changing approach to table extraction in Crawl4AI:

✨ NEW FEATURES:
- LLMTableExtraction: AI-powered extraction for complex HTML tables with rowspan/colspan
- Smart Chunking: Automatically splits massive tables into optimal chunks at row boundaries
- Parallel Processing: Processes multiple chunks simultaneously for blazing-fast extraction
- Intelligent Merging: Seamlessly combines chunk results into complete tables
- Header Preservation: Each chunk maintains context with the original headers
- Auto-retry Logic: Built-in resilience with configurable retry attempts

🏗️ ARCHITECTURE:
- Strategy Design Pattern for pluggable table extraction strategies
- ThreadPoolExecutor for concurrent chunk processing
- Token-based chunking with configurable thresholds
- Handles tables without headers gracefully

⚡ PERFORMANCE:
- Process 1000+ row tables without timeout
- Parallel processing with up to 5 concurrent chunks
- Smart token estimation prevents LLM context overflow
- Optimized for providers like Groq for massive tables

🔧 CONFIGURATION:
- enable_chunking: Auto-handle large tables (default: True)
- chunk_token_threshold: When to split (default: 3000 tokens)
- min_rows_per_chunk: Meaningful chunk sizes (default: 10)
- max_parallel_chunks: Concurrent processing (default: 5)

📚 BACKWARD COMPATIBILITY:
- Existing code continues to work unchanged
- DefaultTableExtraction remains the default strategy
- Progressive enhancement approach

This is the future of web table extraction - handling everything from simple tables to massive, complex data grids with merged cells and nested structures. The chunking is completely transparent to users while providing unprecedented scalability.
2025-08-14 18:21:24 +08:00
parent 926e41aab8
commit a51545c883
9 changed files with 3536 additions and 116 deletions
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -586,117 +586,6 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):

        return root

-    def is_data_table(self, table: etree.Element, **kwargs) -> bool:
-        score = 0
-        # Check for thead and tbody
-        has_thead = len(table.xpath(".//thead")) > 0
-        has_tbody = len(table.xpath(".//tbody")) > 0
-        if has_thead:
-            score += 2
-        if has_tbody:
-            score += 1
-
-        # Check for th elements
-        th_count = len(table.xpath(".//th"))
-        if th_count > 0:
-            score += 2
-            if has_thead or table.xpath(".//tr[1]/th"):
-                score += 1
-
-        # Check for nested tables
-        if len(table.xpath(".//table")) > 0:
-            score -= 3
-
-        # Role attribute check
-        role = table.get("role", "").lower()
-        if role in {"presentation", "none"}:
-            score -= 3
-
-        # Column consistency
-        rows = table.xpath(".//tr")
-        if not rows:
-            return False
-        col_counts = [len(row.xpath(".//td|.//th")) for row in rows]
-        avg_cols = sum(col_counts) / len(col_counts)
-        variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts)
-        if variance < 1:
-            score += 2
-
-        # Caption and summary
-        if table.xpath(".//caption"):
-            score += 2
-        if table.get("summary"):
-            score += 1
-
-        # Text density
-        total_text = sum(len(''.join(cell.itertext()).strip()) for row in rows for cell in row.xpath(".//td|.//th"))
-        total_tags = sum(1 for _ in table.iterdescendants())
-        text_ratio = total_text / (total_tags + 1e-5)
-        if text_ratio > 20:
-            score += 3
-        elif text_ratio > 10:
-            score += 2
-
-        # Data attributes
-        data_attrs = sum(1 for attr in table.attrib if attr.startswith('data-'))
-        score += data_attrs * 0.5
-
-        # Size check
-        if avg_cols >= 2 and len(rows) >= 2:
-            score += 2
-
-        threshold = kwargs.get("table_score_threshold", 7)
-        return score >= threshold
-
-    def extract_table_data(self, table: etree.Element) -> dict:
-        caption = table.xpath(".//caption/text()")
-        caption = caption[0].strip() if caption else ""
-        summary = table.get("summary", "").strip()
-
-        # Extract headers with colspan handling
-        headers = []
-        thead_rows = table.xpath(".//thead/tr")
-        if thead_rows:
-            header_cells = thead_rows[0].xpath(".//th")
-            for cell in header_cells:
-                text = cell.text_content().strip()
-                colspan = int(cell.get("colspan", 1))
-                headers.extend([text] * colspan)
-        else:
-            first_row = table.xpath(".//tr[1]")
-            if first_row:
-                for cell in first_row[0].xpath(".//th|.//td"):
-                    text = cell.text_content().strip()
-                    colspan = int(cell.get("colspan", 1))
-                    headers.extend([text] * colspan)
-
-        # Extract rows with colspan handling
-        rows = []
-        for row in table.xpath(".//tr[not(ancestor::thead)]"):
-            row_data = []
-            for cell in row.xpath(".//td"):
-                text = cell.text_content().strip()
-                colspan = int(cell.get("colspan", 1))
-                row_data.extend([text] * colspan)
-            if row_data:
-                rows.append(row_data)
-
-        # Align rows with headers
-        max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
-        aligned_rows = []
-        for row in rows:
-            aligned = row[:max_columns] + [''] * (max_columns - len(row))
-            aligned_rows.append(aligned)
-
-        if not headers:
-            headers = [f"Column {i+1}" for i in range(max_columns)]
-
-        return {
-            "headers": headers,
-            "rows": aligned_rows,
-            "caption": caption,
-            "summary": summary,
-        }

    def _scrap(
        self,
@@ -839,12 +728,16 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
                **kwargs,
            )

+            # Extract tables using the table extraction strategy if provided
            if 'table' not in excluded_tags:
-                tables = body.xpath(".//table")
-                for table in tables:
-                    if self.is_data_table(table, **kwargs):
-                        table_data = self.extract_table_data(table)
-                        media["tables"].append(table_data)
+                table_extraction = kwargs.get('table_extraction')
+                if table_extraction:
+                    # Pass logger to the strategy if it doesn't have one
+                    if not table_extraction.logger:
+                        table_extraction.logger = self.logger
+                    # Extract tables using the strategy
+                    extracted_tables = table_extraction.extract_tables(body, **kwargs)
+                    media["tables"].extend(extracted_tables)

            # Handle only_text option
            if kwargs.get("only_text", False):