feat: 🚀 Introduce revolutionary LLMTableExtraction with intelligent chunking for massive tables

BREAKING CHANGE: Table extraction now uses the Strategy Design Pattern. This epic commit introduces a game-changing approach to table extraction in Crawl4AI: ✨ NEW FEATURES: - LLMTableExtraction: AI-powered extraction for complex HTML tables with rowspan/colspan - Smart Chunking: Automatically splits massive tables into optimal chunks at row boundaries - Parallel Processing: Processes multiple chunks simultaneously for blazing-fast extraction - Intelligent Merging: Seamlessly combines chunk results into complete tables - Header Preservation: Each chunk maintains context with original headers - Auto-retry Logic: Built-in resilience with configurable retry attempts 🏗️ ARCHITECTURE: - Strategy Design Pattern for pluggable table extraction strategies - ThreadPoolExecutor for concurrent chunk processing - Token-based chunking with configurable thresholds - Handles tables without headers gracefully ⚡ PERFORMANCE: - Process 1000+ row tables without timeout - Parallel processing with up to 5 concurrent chunks - Smart token estimation prevents LLM context overflow - Optimized for providers like Groq for massive tables 🔧 CONFIGURATION: - enable_chunking: Auto-handle large tables (default: True) - chunk_token_threshold: When to split (default: 3000 tokens) - min_rows_per_chunk: Meaningful chunk sizes (default: 10) - max_parallel_chunks: Concurrent processing (default: 5) 📚 BACKWARD COMPATIBILITY: - Existing code continues to work unchanged - DefaultTableExtraction remains the default strategy - Progressive enhancement approach. This is the future of web table extraction - handling everything from simple tables to massive, complex data grids with merged cells and nested structures. The chunking is completely transparent to users while providing unprecedented scalability.
2025-08-14 18:21:24 +08:00
parent 926e41aab8
commit a51545c883
9 changed files with 3536 additions and 116 deletions
--- a/docs/examples/llm_table_extraction_example.py
+++ b/docs/examples/llm_table_extraction_example.py
@@ -0,0 +1,406 @@
+#!/usr/bin/env python3
+"""
+Example demonstrating LLM-based table extraction in Crawl4AI.
+
+This example shows how to use the LLMTableExtraction strategy to extract
+complex tables from web pages, including handling rowspan, colspan, and nested tables.
+"""
+
+import os
+import sys
+
+# Resolve the directory three dirname() steps above this file's absolute path
+# (for docs/examples/<this file>, the directory that contains docs/) and append
+# it to sys.path -- presumably so `crawl4ai` can be imported from a source
+# checkout without installing the package.
+grandparent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(grandparent_dir)
+# Real path of the directory containing this script; not referenced by the
+# examples shown in this file.
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+
+
+import asyncio
+import json
+from crawl4ai import (
+    AsyncWebCrawler,
+    CrawlerRunConfig,
+    LLMConfig,
+    LLMTableExtraction,
+    CacheMode
+)
+import pandas as pd
+
+
+# Example 1: Basic LLM Table Extraction
+async def basic_llm_extraction():
+    """Extract tables from a Wikipedia page using LLMTableExtraction defaults.
+
+    Crawls the "List of countries by GDP (nominal)" article with the cache
+    bypassed, prints how many tables were returned and previews the first one
+    as a pandas DataFrame. If the crawl fails, the crawler's error message is
+    printed instead.
+    """
+    print("\n=== Example 1: Basic LLM Table Extraction ===")
+
+    # Configure LLM (OpenAI gpt-4.1-mini, chosen for cost efficiency)
+    llm_config = LLMConfig(
+        provider="openai/gpt-4.1-mini",
+        api_token="env:OPENAI_API_KEY",  # Uses environment variable
+        temperature=0.1,  # Low temperature for consistency
+        max_tokens=2000
+    )
+
+    # Create LLM table extraction strategy
+    table_strategy = LLMTableExtraction(
+        llm_config=llm_config,
+        verbose=True
+    )
+
+    # Configure crawler with the strategy
+    config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        table_extraction=table_strategy
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        # Extract tables from a Wikipedia page
+        result = await crawler.arun(
+            url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
+            config=config
+        )
+
+        if result.success:
+            print(f"✓ Found {len(result.tables)} tables")
+
+            # Display first table
+            if result.tables:
+                first_table = result.tables[0]
+                headers = first_table['headers']
+                print(f"\nFirst table:")
+                print(f"  Headers: {headers[:5]}...")
+                print(f"  Rows: {len(first_table['rows'])}")
+
+                # Convert to pandas DataFrame. An empty header list would make
+                # pandas reject non-empty rows, so fall back to default column
+                # labels when the table has no headers.
+                df = pd.DataFrame(
+                    first_table['rows'],
+                    columns=headers if headers else None
+                )
+                print(f"\nDataFrame shape: {df.shape}")
+                print(df.head())
+        else:
+            # The crawl result exposes the failure reason as `error_message`.
+            print(f"✗ Extraction failed: {result.error_message}")
+
+
+# Example 2: Focused Extraction with CSS Selector
+async def focused_extraction():
+    """Limit LLM table extraction to one region of a page with a CSS selector.
+
+    Feeds an inline HTML document (a sidebar layout table plus a data table
+    with rowspan/colspan headers) to the crawler through a ``raw:`` URL, then
+    prints the caption, headers, metadata and rows of the first table returned.
+    Nothing further is printed when the crawl fails or yields no tables.
+    """
+    print("\n=== Example 2: Focused Extraction with CSS Selector ===")
+
+    # HTML with multiple tables
+    test_html = """
+    <html>
+    <body>
+        <div class="sidebar">
+            <table role="presentation">
+                <tr><td>Navigation</td></tr>
+            </table>
+        </div>
+        
+        <div class="main-content">
+            <table id="data-table">
+                <caption>Quarterly Sales Report</caption>
+                <thead>
+                    <tr>
+                        <th rowspan="2">Product</th>
+                        <th colspan="3">Q1 2024</th>
+                    </tr>
+                    <tr>
+                        <th>Jan</th>
+                        <th>Feb</th>
+                        <th>Mar</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    <tr>
+                        <td>Widget A</td>
+                        <td>100</td>
+                        <td>120</td>
+                        <td>140</td>
+                    </tr>
+                    <tr>
+                        <td>Widget B</td>
+                        <td>200</td>
+                        <td>180</td>
+                        <td>220</td>
+                    </tr>
+                </tbody>
+            </table>
+        </div>
+    </body>
+    </html>
+    """
+
+    # Strategy scoped to the main content area via css_selector
+    main_content_strategy = LLMTableExtraction(
+        llm_config=LLMConfig(
+            provider="openai/gpt-4.1-mini",
+            api_token="env:OPENAI_API_KEY",
+        ),
+        css_selector=".main-content",
+        verbose=True,
+    )
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        table_extraction=main_content_strategy,
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url=f"raw:{test_html}", config=run_config)
+
+        # Stay silent unless the crawl succeeded and produced a table
+        if not (result.success and result.tables):
+            return
+
+        table = result.tables[0]
+        print(f"✓ Extracted table: {table.get('caption', 'No caption')}")
+        print(f"  Headers: {table['headers']}")
+        print(f"  Metadata: {table['metadata']}")
+
+        # The LLM should have handled the rowspan/colspan correctly
+        print("\nProcessed data (rowspan/colspan handled):")
+        for row_number, row in enumerate(table['rows'], start=1):
+            print(f"  Row {row_number}: {row}")
+
+
+# Example 3: Comparing with Default Extraction
+async def compare_strategies():
+    """Run the default and LLM strategies on the same table and print both results.
+
+    The inline HTML table has rowspan/colspan headers and a full-width note
+    row. Each strategy crawls it through a ``raw:`` URL with the cache
+    bypassed; the first table of each result (if any) is then printed.
+    """
+    print("\n=== Example 3: Comparing LLM vs Default Extraction ===")
+
+    # Complex table with nested structure
+    complex_html = """
+    <html>
+    <body>
+        <table>
+            <tr>
+                <th rowspan="3">Category</th>
+                <th colspan="2">2023</th>
+                <th colspan="2">2024</th>
+            </tr>
+            <tr>
+                <th>H1</th>
+                <th>H2</th>
+                <th>H1</th>
+                <th>H2</th>
+            </tr>
+            <tr>
+                <td colspan="4">All values in millions</td>
+            </tr>
+            <tr>
+                <td>Revenue</td>
+                <td>100</td>
+                <td>120</td>
+                <td>130</td>
+                <td>145</td>
+            </tr>
+            <tr>
+                <td>Profit</td>
+                <td>20</td>
+                <td>25</td>
+                <td>28</td>
+                <td>32</td>
+            </tr>
+        </table>
+    </body>
+    </html>
+    """
+
+    async with AsyncWebCrawler() as crawler:
+        from crawl4ai import DefaultTableExtraction
+
+        async def crawl_with(strategy):
+            """Crawl the inline HTML with ``strategy`` and return the crawl result."""
+            return await crawler.arun(
+                url=f"raw:{complex_html}",
+                config=CrawlerRunConfig(
+                    cache_mode=CacheMode.BYPASS,
+                    table_extraction=strategy,
+                ),
+            )
+
+        # Default (non-LLM) extraction first
+        result_default = await crawl_with(
+            DefaultTableExtraction(table_score_threshold=3, verbose=True)
+        )
+
+        # Then LLM extraction on the same markup
+        result_llm = await crawl_with(
+            LLMTableExtraction(
+                llm_config=LLMConfig(
+                    provider="openai/gpt-4.1-mini",
+                    api_token="env:OPENAI_API_KEY",
+                ),
+                verbose=True,
+            )
+        )
+
+        # Compare results: the default output is previewed (first 3 rows only)
+        print("\nDefault Extraction:")
+        if result_default.tables:
+            default_table = result_default.tables[0]
+            default_rows = default_table.get('rows', [])
+            print(f"  Headers: {default_table.get('headers', [])}")
+            print(f"  Rows: {len(default_rows)}")
+            for row_number, row in enumerate(default_rows[:3], start=1):
+                print(f"    Row {row_number}: {row}")
+
+        # The LLM output is printed in full, including its metadata
+        print("\nLLM Extraction (handles complex structure better):")
+        if result_llm.tables:
+            llm_table = result_llm.tables[0]
+            llm_rows = llm_table.get('rows', [])
+            print(f"  Headers: {llm_table.get('headers', [])}")
+            print(f"  Rows: {len(llm_rows)}")
+            for row_number, row in enumerate(llm_rows, start=1):
+                print(f"    Row {row_number}: {row}")
+            print(f"  Metadata: {llm_table.get('metadata', {})}")
+
+
+# Example 4: Using Local Models (Ollama)
+async def local_model_extraction():
+    """Extract a small inline table with a local Ollama model.
+
+    Points LLMConfig at ``http://localhost:11434`` with the ``ollama/llama3.3``
+    provider, crawls a three-row product table through a ``raw:`` URL and
+    prints it as a DataFrame. If the crawl fails or returns no tables, a hint
+    about running Ollama is printed instead.
+    """
+    print("\n=== Example 4: Local Model Extraction with Ollama ===")
+
+    # Simple test HTML
+    test_html = """
+    <table>
+        <thead>
+            <tr><th>Product</th><th>Price</th><th>Stock</th></tr>
+        </thead>
+        <tbody>
+            <tr><td>Apple</td><td>$1.50</td><td>100</td></tr>
+            <tr><td>Banana</td><td>$0.50</td><td>200</td></tr>
+            <tr><td>Orange</td><td>$2.00</td><td>50</td></tr>
+        </tbody>
+    </table>
+    """
+
+    # Configure for local Ollama (no API token needed)
+    ollama_config = LLMConfig(
+        provider="ollama/llama3.3",
+        api_token=None,
+        base_url="http://localhost:11434",
+        temperature=0.1,
+    )
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        table_extraction=LLMTableExtraction(llm_config=ollama_config, verbose=True),
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url=f"raw:{test_html}", config=run_config)
+
+        if not (result.success and result.tables):
+            print("✗ Make sure Ollama is running locally with llama3.3 model")
+            return
+
+        print("✓ Extracted with local model:")
+
+        # Create DataFrame from the first extracted table
+        extracted = result.tables[0]
+        frame = pd.DataFrame(extracted['rows'], columns=extracted['headers'])
+        print(frame.to_string())
+
+
+# Example 5: Batch Processing Multiple Pages
+async def batch_extraction():
+    """Crawl several Wikipedia pages in sequence and summarise their first tables.
+
+    One crawler and one LLM strategy (scoped to ``table.wikitable``) are
+    reused for every URL. For each page that succeeds and returns tables,
+    the first table is kept; a column/row summary is printed at the end.
+    """
+    print("\n=== Example 5: Batch Table Extraction ===")
+
+    urls = [
+        "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)",
+        "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
+        "https://en.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index"
+    ]
+
+    wikitable_strategy = LLMTableExtraction(
+        llm_config=LLMConfig(
+            provider="openai/gpt-4.1-mini",
+            api_token="env:OPENAI_API_KEY",
+            temperature=0.1,
+            max_tokens=1500,
+        ),
+        css_selector="table.wikitable",  # Wikipedia data tables
+        verbose=False,
+    )
+    run_config = CrawlerRunConfig(
+        table_extraction=wikitable_strategy,
+        cache_mode=CacheMode.BYPASS,
+    )
+
+    all_tables = []
+
+    async with AsyncWebCrawler() as crawler:
+        for url in urls:
+            page_name = url.rsplit('/', 1)[-1]
+            print(f"\nProcessing: {page_name[:50]}...")
+            result = await crawler.arun(url=url, config=run_config)
+
+            # Pages that failed or produced no tables are skipped
+            if not (result.success and result.tables):
+                continue
+
+            print(f"  ✓ Found {len(result.tables)} tables")
+            # Store first table from each page
+            all_tables.append({'url': url, 'table': result.tables[0]})
+
+    # Summary
+    print("\n=== Summary ===")
+    print(f"Extracted {len(all_tables)} tables from {len(urls)} pages")
+    for entry in all_tables:
+        first_table = entry['table']
+        source_name = entry['url'].rsplit('/', 1)[-1]
+        print(f"\nFrom {source_name[:30]}:")
+        print(f"  Columns: {len(first_table['headers'])}")
+        print(f"  Rows: {len(first_table['rows'])}")
+
+
+async def main():
+    """Print a banner, run the enabled examples, then print a completion footer."""
+    banner = "=" * 60
+    print(banner)
+    print("LLM TABLE EXTRACTION EXAMPLES")
+    print(banner)
+
+    # Only the basic example is enabled; uncomment the calls below to run more.
+
+    # Basic extraction
+    await basic_llm_extraction()
+
+    # Focused extraction with CSS
+    # await focused_extraction()
+
+    # Compare strategies
+    # await compare_strategies()
+
+    # Local model (requires Ollama)
+    # await local_model_extraction()
+
+    # Batch processing
+    # await batch_extraction()
+
+    print("\n" + banner)
+    print("ALL EXAMPLES COMPLETED")
+    print(banner)
+
+
+# Script entry point: run the main() coroutine with asyncio.run() when this
+# file is executed directly rather than imported.
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/table_extraction_example.py
+++ b/docs/examples/table_extraction_example.py
@@ -0,0 +1,276 @@
+"""
+Example: Using Table Extraction Strategies in Crawl4AI
+
+This example demonstrates how to use different table extraction strategies
+to extract tables from web pages.
+"""
+
+import asyncio
+import pandas as pd
+from crawl4ai import (
+    AsyncWebCrawler,
+    CrawlerRunConfig,
+    CacheMode,
+    DefaultTableExtraction,
+    NoTableExtraction,
+    TableExtractionStrategy
+)
+from typing import Dict, List, Any
+
+
+async def example_default_extraction():
+    """Example 1: Crawl without a table_extraction strategy and preview the first table.
+
+    Only ``table_score_threshold`` is set on the run config. When the crawl
+    succeeds and tables are returned, the first table is loaded into a pandas
+    DataFrame (default column labels if it has no headers) and previewed.
+    """
+    separator = "=" * 50
+    print("\n" + separator)
+    print("Example 1: Default Table Extraction")
+    print(separator)
+
+    async with AsyncWebCrawler() as crawler:
+        # No need to specify table_extraction - uses DefaultTableExtraction automatically
+        run_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            table_score_threshold=7,  # Adjust sensitivity (default: 7)
+        )
+
+        result = await crawler.arun(
+            "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
+            config=run_config,
+        )
+
+        # Nothing to show unless the crawl succeeded and found tables
+        if not (result.success and result.tables):
+            return
+
+        print(f"Found {len(result.tables)} tables")
+
+        # Convert first table to pandas DataFrame
+        table = result.tables[0]
+        frame = pd.DataFrame(table['rows'], columns=table['headers'] or None)
+        print("\nFirst table preview:")
+        print(frame.head())
+        print(f"Shape: {frame.shape}")
+
+
+async def example_custom_configuration():
+    """Example 2: Tune DefaultTableExtraction and report every table it returns.
+
+    Uses a lower score threshold together with minimum row/column counts,
+    and a ``css_selector`` on the run config. For each table returned, the
+    caption and the size/header metadata are printed.
+    """
+    separator = "=" * 50
+    print("\n" + separator)
+    print("Example 2: Custom Table Configuration")
+    print(separator)
+
+    async with AsyncWebCrawler() as crawler:
+        # Create custom extraction strategy with specific settings
+        tuned_strategy = DefaultTableExtraction(
+            table_score_threshold=5,  # Lower threshold for more permissive detection
+            min_rows=3,  # Only extract tables with at least 3 rows
+            min_cols=2,  # Only extract tables with at least 2 columns
+            verbose=True,
+        )
+
+        run_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            table_extraction=tuned_strategy,
+            # Target specific tables using CSS selector
+            css_selector="div.main-content",
+        )
+
+        result = await crawler.arun("https://example.com/data", config=run_config)
+
+        if not result.success:
+            return
+
+        print(f"Found {len(result.tables)} tables matching criteria")
+
+        for table_number, table in enumerate(result.tables, start=1):
+            print(f"\nTable {table_number}:")
+            print(f"  Caption: {table.get('caption', 'No caption')}")
+            meta = table['metadata']
+            print(f"  Size: {meta['row_count']} rows × {meta['column_count']} columns")
+            print(f"  Has headers: {meta['has_headers']}")
+
+
+async def example_disable_extraction():
+    """Example 3: Pass NoTableExtraction so the crawl skips table processing."""
+    separator = "=" * 50
+    print("\n" + separator)
+    print("Example 3: Disable Table Extraction")
+    print(separator)
+
+    async with AsyncWebCrawler() as crawler:
+        # Use NoTableExtraction to skip table processing entirely
+        no_tables_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            table_extraction=NoTableExtraction(),  # No tables will be extracted
+        )
+
+        result = await crawler.arun("https://example.com", config=no_tables_config)
+
+        if not result.success:
+            return
+
+        print(f"Tables extracted: {len(result.tables)} (should be 0)")
+        print("Table extraction disabled - better performance for non-table content")
+
+
+class FinancialTableExtraction(TableExtractionStrategy):
+    """
+    Custom strategy that keeps only tables whose text contains a currency symbol.
+
+    Each kept table is returned as a dict with ``headers``, ``rows`` (currency
+    symbols stripped from every cell), ``caption``, ``summary`` and a
+    ``metadata`` dict tagged ``type="financial"``.
+    """
+
+    def __init__(self, currency_symbols=None, **kwargs):
+        """
+        Args:
+            currency_symbols: Symbols that mark a table as financial and that
+                are removed from cell text. Defaults to ``$``, ``€``, ``£``, ``¥``.
+            **kwargs: Forwarded unchanged to ``TableExtractionStrategy``.
+        """
+        super().__init__(**kwargs)
+        self.currency_symbols = currency_symbols or ['$', '€', '£', '¥']
+
+    def extract_tables(self, element, **kwargs):
+        """Extract only tables that appear to contain financial data.
+
+        Args:
+            element: Parsed HTML element searched with ``.//table``; cells are
+                read through ``text_content()``.
+            **kwargs: Accepted for interface compatibility; unused here.
+
+        Returns:
+            A list of table dicts (see class docstring); empty if no table
+            contains any of the configured currency symbols.
+        """
+        tables_data = []
+
+        for table in element.xpath(".//table"):
+            # Check if table contains currency symbols
+            table_text = ''.join(table.itertext())
+            if not any(symbol in table_text for symbol in self.currency_symbols):
+                continue
+
+            # Extract headers
+            headers = [
+                th.text_content().strip()
+                for th in table.xpath(".//thead//th | .//tr[1]//th")
+            ]
+
+            # Extract rows
+            # NOTE(review): a first <tr> that sits outside <tbody> is never
+            # selected here, even if it holds <td> cells -- confirm this is
+            # intended for header-less tables.
+            rows = []
+            for tr in table.xpath(".//tbody//tr | .//tr[position()>1]"):
+                row = []
+                for td in tr.xpath(".//td"):
+                    cell_text = td.text_content().strip()
+                    # Clean currency values
+                    for symbol in self.currency_symbols:
+                        cell_text = cell_text.replace(symbol, '')
+                    row.append(cell_text)
+                if row:
+                    rows.append(row)
+
+            if not (headers or rows):
+                continue
+
+            # Read the caption from the element itself: indexing
+            # ".//caption/text()" raised IndexError when the caption had no
+            # direct text node (e.g. <caption><b>Title</b></caption>).
+            captions = table.xpath(".//caption")
+            caption = captions[0].text_content().strip() if captions else ""
+
+            tables_data.append({
+                "headers": headers,
+                "rows": rows,
+                "caption": caption,
+                "summary": table.get("summary", ""),
+                "metadata": {
+                    "type": "financial",
+                    "has_currency": True,
+                    "row_count": len(rows),
+                    "column_count": len(headers) if headers else len(rows[0]) if rows else 0
+                }
+            })
+
+        return tables_data
+
+
+async def example_custom_strategy():
+    """Example 4: Plug the custom FinancialTableExtraction strategy into a crawl.
+
+    Crawls finance.yahoo.com with the strategy restricted to ``$`` and ``€``,
+    then prints the row count of each returned table tagged as financial.
+    """
+    separator = "=" * 50
+    print("\n" + separator)
+    print("Example 4: Custom Financial Table Strategy")
+    print(separator)
+
+    async with AsyncWebCrawler() as crawler:
+        # Use custom strategy for financial tables
+        financial_strategy = FinancialTableExtraction(
+            currency_symbols=['$', '€'],
+            verbose=True,
+        )
+        run_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            table_extraction=financial_strategy,
+        )
+
+        result = await crawler.arun("https://finance.yahoo.com/", config=run_config)
+
+        if not result.success:
+            return
+
+        print(f"Found {len(result.tables)} financial tables")
+
+        for table in result.tables:
+            if table['metadata'].get('type') != 'financial':
+                continue
+            print(f"  ✓ Financial table with {table['metadata']['row_count']} rows")
+
+
+async def example_combined_extraction():
+    """Example 5: Combine table extraction with LLM-based structured extraction.
+
+    A single crawl is configured with both ``DefaultTableExtraction`` (tables
+    end up in ``result.tables``) and ``LLMExtractionStrategy`` with a JSON
+    schema (output ends up in ``result.extracted_content`` as a JSON string).
+    Requires an OpenAI API key in the ``OPENAI_API_KEY`` environment variable.
+    """
+    print("\n" + "="*50)
+    print("Example 5: Combined Extraction Strategies")
+    print("="*50)
+
+    import json
+
+    from crawl4ai import LLMExtractionStrategy, LLMConfig
+
+    async with AsyncWebCrawler() as crawler:
+        # Define schema for structured extraction
+        schema = {
+            "type": "object",
+            "properties": {
+                "page_title": {"type": "string"},
+                "main_topic": {"type": "string"},
+                "key_figures": {
+                    "type": "array",
+                    "items": {"type": "string"}
+                }
+            }
+        }
+
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            # Table extraction
+            table_extraction=DefaultTableExtraction(
+                table_score_threshold=6,
+                min_rows=2
+            ),
+            # LLM extraction for structured data
+            extraction_strategy=LLMExtractionStrategy(
+                # Use the explicit "provider/model" form, as the other LLM
+                # examples do; a bare "openai" names no model.
+                llm_config=LLMConfig(
+                    provider="openai/gpt-4.1-mini",
+                    api_token="env:OPENAI_API_KEY"
+                ),
+                schema=schema
+            )
+        )
+
+        result = await crawler.arun(
+            "https://en.wikipedia.org/wiki/Economy_of_the_United_States",
+            config=config
+        )
+
+        if result.success:
+            print(f"Tables found: {len(result.tables)}")
+
+            # Tables are in result.tables
+            if result.tables:
+                print(f"First table has {len(result.tables[0]['rows'])} rows")
+
+            # Structured data is in result.extracted_content
+            if result.extracted_content:
+                structured_data = json.loads(result.extracted_content)
+                # The decoded payload may be a JSON array of extracted
+                # objects rather than a single object; use the first entry so
+                # the .get() lookups below do not fail on a list.
+                if isinstance(structured_data, list):
+                    structured_data = structured_data[0] if structured_data else {}
+                if isinstance(structured_data, dict):
+                    print(f"Page title: {structured_data.get('page_title', 'N/A')}")
+                    print(f"Main topic: {structured_data.get('main_topic', 'N/A')}")
+
+
+async def main():
+    """Print a banner, run the examples that need no API key, then print a footer."""
+    banner = "=" * 60
+    print("\n" + banner)
+    print("CRAWL4AI TABLE EXTRACTION EXAMPLES")
+    print(banner)
+
+    # Run examples in order; each one opens its own crawler
+    await example_default_extraction()
+    await example_custom_configuration()
+    await example_disable_extraction()
+    await example_custom_strategy()
+    # await example_combined_extraction()  # Requires OpenAI API key
+
+    print("\n" + banner)
+    print("EXAMPLES COMPLETED")
+    print(banner)
+
+
+# Script entry point: run the main() coroutine with asyncio.run() when this
+# file is executed directly rather than imported.
+if __name__ == "__main__":
+    asyncio.run(main())